diff --git a/examples/gpt-2/CMakeLists.txt b/examples/gpt-2/CMakeLists.txt
index 2307a7dd93..6ddada0619 100644
--- a/examples/gpt-2/CMakeLists.txt
+++ b/examples/gpt-2/CMakeLists.txt
@@ -18,6 +18,11 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
 if (GGML_CUBLAS)
     add_compile_definitions(GGML_USE_CUBLAS)
 endif()
+
 if (GGML_CLBLAST)
     add_compile_definitions(GGML_USE_CLBLAST)
 endif()
+
+if (GGML_METAL)
+    add_compile_definitions(GGML_USE_METAL)
+endif()
diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp
index e7bab0ba1d..25725a1d1c 100644
--- a/examples/gpt-2/main.cpp
+++ b/examples/gpt-2/main.cpp
@@ -6,6 +6,10 @@
 #include "ggml-cuda.h"
 #endif
 
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
 #include "common.h"
 #include "common-ggml.h"
@@ -22,6 +26,13 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
     int32_t n_vocab = 50257;
@@ -234,6 +245,17 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     }
 #endif
 
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        fprintf(stderr, "%s: using Metal backend\n", __func__);
+        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        model.backend = ggml_backend_metal_init();
+        if (!model.backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        }
+    }
+#endif
+
     if (!model.backend) {
         // fallback to CPU backend
         fprintf(stderr, "%s: using CPU backend\n", __func__);
@@ -521,9 +543,8 @@ struct ggml_cgraph * gpt2_graph(
             // [ 768, N]
             cur = ggml_add(ctx0,
                     ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    //ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+                        cur,
+                        model.layers[il].ln_1_g),
                     model.layers[il].ln_1_b);
         }
@@ -541,8 +562,8 @@ struct ggml_cgraph * gpt2_graph(
                     cur);
 
             cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
+                    cur,
+                    model.layers[il].c_attn_attn_b);
         }
 
         // self-attention
@@ -649,8 +670,8 @@ struct ggml_cgraph * gpt2_graph(
                     cur);
 
             cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
+                    cur,
+                    model.layers[il].c_attn_proj_b);
         }
 
         // add the input
@@ -668,9 +689,8 @@ struct ggml_cgraph * gpt2_graph(
             // [ 768, N]
             cur = ggml_add(ctx0,
                     ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                        cur),
-                    //ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
+                        cur,
+                        model.layers[il].ln_2_g),
                     model.layers[il].ln_2_b);
         }
@@ -687,8 +707,8 @@ struct ggml_cgraph * gpt2_graph(
                     cur);
 
             cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
+                    cur,
+                    model.layers[il].c_mlp_fc_b);
 
             // GELU activation
             // [3072, N]
@@ -707,8 +727,8 @@ struct ggml_cgraph * gpt2_graph(
                     cur);
 
             cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
+                    cur,
+                    model.layers[il].c_mlp_proj_b);
         }
 
         // input for next layer
@@ -724,9 +744,8 @@ struct ggml_cgraph * gpt2_graph(
         // [ 768, N]
        inpL = ggml_add(ctx0,
                 ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                //ggml_repeat(ctx0, model.ln_f_b, inpL));
+                    inpL,
+                    model.ln_f_g),
                 model.ln_f_b);
     }
@@ -778,6 +797,8 @@ bool gpt2_eval(
     // run the computation
     if (ggml_backend_is_cpu(model.backend)) {
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    } else if (ggml_backend_is_metal(model.backend)) {
+        ggml_backend_metal_set_n_threads(model.backend, n_threads);
     }
     ggml_backend_graph_compute(model.backend, gf);
diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h
index 606ea5e4d1..36457e9910 100644
--- a/include/ggml/ggml-backend.h
+++ b/include/ggml/ggml-backend.h
@@ -132,14 +132,17 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-
     GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
 
     ///////////////////////////
 
+    // TODO: we should probably do something better here
+    GGML_API bool ggml_backend_is_cpu  (ggml_backend_t backend);
+    GGML_API bool ggml_backend_is_cuda (ggml_backend_t backend);
+    GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
 #if 0
     // graph splitting
     #define GGML_MAX_SPLITS 200
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bcfb4b23bf..b225597eda 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -229,7 +229,7 @@ if (GGML_METAL)
     set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h)
 
     add_compile_definitions(GGML_USE_METAL)
-    add_compile_definitions(GGML_METAL_NDEBUG)
+    #add_compile_definitions(GGML_METAL_NDEBUG)
 
     # get full path to the file
     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
diff --git a/src/ggml-backend.c b/src/ggml-backend.c
index c5bc032809..187a149c4f 100644
--- a/src/ggml-backend.c
+++ b/src/ggml-backend.c
@@ -369,10 +369,6 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     return cpu_backend;
 }
 
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend->interface.get_name == ggml_backend_cpu_name;
-}
-
 void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
@@ -385,6 +381,18 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
     return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size);
 }
 
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend->interface.get_name == ggml_backend_cpu_name;
+}
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return strcmp(ggml_backend_name(backend), "CUDA") == 0;
+}
+
+bool ggml_backend_is_metal(ggml_backend_t backend) {
+    return strcmp(ggml_backend_name(backend), "Metal") == 0;
+}
+
 #if 0
 
 // splits
diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h
index 81ee9a2e94..57adc9cf34 100644
--- a/src/ggml-cuda.h
+++ b/src/ggml-cuda.h
@@ -46,7 +46,6 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description,
 // backend API
 GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
 
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/ggml-metal.h b/src/ggml-metal.h
index 790cf0bf7b..bc6773a6e6 100644
--- a/src/ggml-metal.h
+++ b/src/ggml-metal.h
@@ -20,6 +20,7 @@
 #pragma once
 
 #include "ggml.h"
+#include "ggml-backend.h"
 
 #include <stddef.h>
 #include <stdbool.h>
@@ -35,10 +36,15 @@ struct ggml_cgraph;
 extern "C" {
 #endif
 
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+//
+// internal API
+// temporarily exposed to user-code
+//
 
 struct ggml_metal_context;
 
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 // number of command buffers to use
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -83,6 +89,15 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/ggml-metal.m b/src/ggml-metal.m
index 866fed4344..a06b738a19 100644
--- a/src/ggml-metal.m
+++ b/src/ggml-metal.m
@@ -151,8 +151,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
     }
 }
 
-
-
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
@@ -1371,3 +1369,136 @@ void ggml_metal_graph_compute(
         }
     }
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+    return "Metal";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_free(ggml_backend_t backend) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+    ggml_metal_free(ctx);
+    free(backend);
+}
+
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)buffer->context;
+}
+
+static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+    /* .free_buffer    = */ ggml_backend_metal_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_metal_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL, // no initialization required
+    /* .free_tensor    = */ NULL, // no cleanup required
+};
+
+static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    void * data = ggml_metal_host_malloc(size);
+
+    // TODO: set proper name of the buffers
+    ggml_metal_add_buffer(ctx, "backend", data, size, 0);
+
+    return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
+    return 32;
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_graph_compute(metal_ctx, cgraph);
+}
+
+static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return true;
+    UNUSED(backend);
+    UNUSED(op);
+}
+
+static struct ggml_backend_i metal_backend_i = {
+    /* .get_name           = */ ggml_backend_metal_name,
+    /* .free               = */ ggml_backend_metal_free,
+    /* .alloc_buffer       = */ ggml_backend_metal_alloc_buffer,
+    /* .get_alignment      = */ ggml_backend_metal_get_alignment,
+    /* .set_tensor_async   = */ ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async   = */ ggml_backend_metal_get_tensor_async,
+    /* .synchronize        = */ ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from    = */ ggml_backend_metal_cpy_tensor_from,
+    /* .cpy_tensor_to      = */ ggml_backend_metal_cpy_tensor_to,
+    /* .graph_plan_create  = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .graph_plan_free    = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute      = */ ggml_backend_metal_graph_compute,
+    /* .supports_op        = */ ggml_backend_metal_supports_op,
+};
+
+ggml_backend_t ggml_backend_metal_init(void) {
+    // ggml_metal_init() allocates and returns the context,
+    // so there is no need for a separate malloc here
+    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+
+    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
+
+    *metal_backend = (struct ggml_backend) {
+        /* .interface = */ metal_backend_i,
+        /* .context   = */ ctx,
+    };
+
+    return metal_backend;
+}
+
+void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_set_n_cb(ctx, n_threads);
+}
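
Usage note: the sketch below shows how user code is expected to select the new Metal backend and run a graph with it, mirroring the gpt-2 example changes above. It is a minimal illustration and not part of the patch; the helper names init_backend() and run_graph() are made up for the example, while every API call comes from this diff or the existing CPU backend API.

    #include "ggml/ggml.h"
    #include "ggml/ggml-backend.h"

    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    #include <stdio.h>

    // hypothetical helper: prefer the Metal backend when compiled in,
    // otherwise fall back to the CPU backend
    static ggml_backend_t init_backend(void) {
        ggml_backend_t backend = NULL;

    #ifdef GGML_USE_METAL
        backend = ggml_backend_metal_init();
        if (!backend) {
            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
        }
    #endif

        if (!backend) {
            backend = ggml_backend_cpu_init();
        }

        return backend;
    }

    // hypothetical helper: set the thread / command-buffer count for the selected
    // backend, then evaluate the graph through the common ggml-backend interface
    static void run_graph(ggml_backend_t backend, struct ggml_cgraph * gf, int n_threads) {
        if (ggml_backend_is_cpu(backend)) {
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        } else if (ggml_backend_is_metal(backend)) {
            ggml_backend_metal_set_n_threads(backend, n_threads);
        }

        ggml_backend_graph_compute(backend, gf);
    }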