From 35fbd712298c51c693e8a04b51b9fba3759080bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Oct 2023 16:16:25 +0300 Subject: [PATCH 1/5] ggml-backend : metal (WIP) --- examples/gpt-2/CMakeLists.txt | 5 + examples/gpt-2/main.cpp | 14 +++ src/ggml-cuda.h | 1 - src/ggml-metal.h | 31 +++--- src/ggml-metal.m | 187 ++++++++++++++++++++++++++++++++++ 5 files changed, 224 insertions(+), 14 deletions(-) diff --git a/examples/gpt-2/CMakeLists.txt b/examples/gpt-2/CMakeLists.txt index 2307a7dd93..6ddada0619 100644 --- a/examples/gpt-2/CMakeLists.txt +++ b/examples/gpt-2/CMakeLists.txt @@ -18,6 +18,11 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) if (GGML_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS) endif() + if (GGML_CLBLAST) add_compile_definitions(GGML_USE_CLBLAST) endif() + +if (GGML_METAL) + add_compile_definitions(GGML_USE_METAL) +endif() diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index e7bab0ba1d..e040ccbae0 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -6,6 +6,10 @@ #include "ggml-cuda.h" #endif +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + #include "common.h" #include "common-ggml.h" @@ -234,6 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } #endif +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + if (!model.backend) { // fallback to CPU backend fprintf(stderr, "%s: using CPU backend\n", __func__); diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h index 81ee9a2e94..57adc9cf34 100644 --- a/src/ggml-cuda.h +++ b/src/ggml-cuda.h @@ -46,7 +46,6 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, // backend API GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use - #ifdef __cplusplus } #endif diff --git a/src/ggml-metal.h b/src/ggml-metal.h index 790cf0bf7b..ac93716b5d 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -20,6 +20,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #include #include @@ -35,19 +36,19 @@ struct ggml_cgraph; extern "C" { #endif -void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); +GGML_API void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); struct ggml_metal_context; // number of command buffers to use -struct ggml_metal_context * ggml_metal_init(int n_cb); -void ggml_metal_free(struct ggml_metal_context * ctx); +GGML_API struct ggml_metal_context * ggml_metal_init(int n_cb); +GGML_API void ggml_metal_free(struct ggml_metal_context * ctx); -void * ggml_metal_host_malloc(size_t n); -void ggml_metal_host_free (void * data); +GGML_API void * ggml_metal_host_malloc(size_t n); +GGML_API void ggml_metal_host_free (void * data); // set the number of command buffers to use -void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); +GGML_API void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute @@ -56,7 +57,7 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); // - max_size specifies the maximum size of a tensor and is used to create shared views such // that it is 
guaranteed that the tensor will fit in at least one of the views // -bool ggml_metal_add_buffer( +GGML_API bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, const char * name, void * data, @@ -64,24 +65,28 @@ bool ggml_metal_add_buffer( size_t max_size); // set data from host memory into the device -void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); +GGML_API void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); // get data from the device into host memory -void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); +GGML_API void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); // try to find operations that can be run concurrently in the graph // you should run it again if the topology of your graph changes -void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); +GGML_API void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized -int ggml_metal_if_optimized(struct ggml_metal_context * ctx); +GGML_API int ggml_metal_if_optimized(struct ggml_metal_context * ctx); // output the concur_list for ggml_alloc -int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); +GGML_API int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // same as ggml_graph_compute but uses Metal // creates gf->n_threads command buffers in parallel -void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +GGML_API void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); + +// backend API +GGML_API ggml_backend_t ggml_backend_metal_init(void); +GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend_metal, int n_threads); #ifdef __cplusplus } diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 866fed4344..010207d590 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1371,3 +1371,190 @@ void ggml_metal_graph_compute( } } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#define UNUSED GGML_UNUSED + +static const char * ggml_backend_metal_name(ggml_backend_t backend) { + return "Metal"; + + UNUSED(backend); +} + +static void ggml_backend_metal_free(ggml_backend_t backend) { + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + ggml_metal_free(metal_ctx); + free(backend); +} + +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i metal_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required +}; + +// for buffers from ptr, free is not called +static struct ggml_backend_buffer_i metal_backend_buffer_i_from_ptr = { + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed + /* .get_base = */ ggml_backend_metal_buffer_get_base, + /* .get_alloc_size = */ 
NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, + /* .free_tensor = */ NULL, +}; + +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 + +static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { + void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned + // TODO: maybe use GGML_ALIGNED_MALLOC? + return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); +} + +static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { + return TENSOR_ALIGNMENT; + UNUSED(backend); +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +struct ggml_backend_plan_metal { + struct ggml_cplan cplan; + struct ggml_cgraph cgraph; +}; + +static ggml_backend_graph_plan_t ggml_backend_metal_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + + struct ggml_backend_plan_metal * metal_plan = malloc(sizeof(struct ggml_backend_plan_metal)); + + metal_plan->cplan = ggml_graph_plan(cgraph, 1); + metal_plan->cgraph = *cgraph; + + if (metal_plan->cplan.work_size > 0) { + metal_plan->cplan.work_data = malloc(metal_plan->cplan.work_size); + } + + return metal_plan; +} + +static void ggml_backend_metal_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_metal * metal_plan = (struct ggml_backend_plan_metal *)plan; + + free(metal_plan->cplan.work_data); + free(metal_plan); + + UNUSED(backend); +} + +static void ggml_backend_metal_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_metal * metal_plan = (struct ggml_backend_plan_metal *)plan; + + ggml_graph_compute(&metal_plan->cgraph, &metal_plan->cplan); + + UNUSED(backend); +} + +static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, metal_ctx->n_threads); + + if 
(metal_ctx->work_size < cplan.work_size) { + // TODO: may be faster to free and use malloc to avoid the copy + metal_ctx->work_data = realloc(metal_ctx->work_data, cplan.work_size); + metal_ctx->work_size = cplan.work_size; + } + + cplan.work_data = metal_ctx->work_data; + + ggml_graph_compute(cgraph, &cplan); +} + +static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_i metal_backend_i = { + /* .get_name = */ ggml_backend_metal_name, + /* .free = */ ggml_backend_metal_free, + /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_get_alignment, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .synchronize = */ ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, + /* .graph_plan_create = */ ggml_backend_metal_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_metal_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_metal_graph_plan_compute, + /* .graph_compute = */ ggml_backend_metal_graph_compute, + /* .supports_op = */ ggml_backend_metal_supports_op, +}; + +ggml_backend_t ggml_backend_metal_init(void) { + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + + ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); + + *metal_backend = (struct ggml_backend) { + /* .interface = */ metal_backend_i, + /* .context = */ ctx, + }; + + return metal_backend; +} + +void ggml_backend_metal_set_n_threads(ggml_backend_t backend_metal, int n_threads) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend_metal->context; + + ggml_metal_set_n_cb(ctx, n_threads); +} From 50d00ee5350a6eb42788424b5da117e8df76c79e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Oct 2023 16:27:55 +0300 Subject: [PATCH 2/5] ggml-backend : metal (adapt CPU backend) --- src/ggml-metal.h | 1 - src/ggml-metal.m | 69 ++++++++++++++++++++---------------------------- 2 files changed, 29 insertions(+), 41 deletions(-) diff --git a/src/ggml-metal.h b/src/ggml-metal.h index ac93716b5d..250a5ef81a 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -86,7 +86,6 @@ GGML_API void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct g // backend API GGML_API ggml_backend_t ggml_backend_metal_init(void); -GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend_metal, int n_threads); #ifdef __cplusplus } diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 010207d590..327fa86176 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1378,6 +1378,15 @@ void ggml_metal_graph_compute( #define UNUSED GGML_UNUSED +struct ggml_backend_metal_context { + int n_threads; + + void * work_data; + size_t work_size; + + struct ggml_metal_context * ctx; +}; + static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; @@ -1385,8 +1394,10 @@ void ggml_metal_graph_compute( } static void ggml_backend_metal_free(ggml_backend_t backend) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; - ggml_metal_free(metal_ctx); + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; + ggml_metal_free(metal_ctx->ctx); + free(metal_ctx->work_data); + free(metal_ctx); free(backend); } @@ 
-1407,25 +1418,16 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) /* .free_tensor = */ NULL, // no cleanup required }; -// for buffers from ptr, free is not called -static struct ggml_backend_buffer_i metal_backend_buffer_i_from_ptr = { - /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed - /* .get_base = */ ggml_backend_metal_buffer_get_base, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .init_tensor = */ NULL, - /* .free_tensor = */ NULL, -}; - -static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 - static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { - void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned - // TODO: maybe use GGML_ALIGNED_MALLOC? + void * data = ggml_metal_host_malloc(size); + return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); } static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { - return TENSOR_ALIGNMENT; + // TODO: get page size ?? + //return sysconf(_SC_PAGESIZE); + return 32; UNUSED(backend); } @@ -1465,16 +1467,16 @@ static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml } struct ggml_backend_plan_metal { - struct ggml_cplan cplan; + struct ggml_cplan cplan; struct ggml_cgraph cgraph; }; static ggml_backend_graph_plan_t ggml_backend_metal_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; struct ggml_backend_plan_metal * metal_plan = malloc(sizeof(struct ggml_backend_plan_metal)); - metal_plan->cplan = ggml_graph_plan(cgraph, 1); + metal_plan->cplan = ggml_graph_plan(cgraph, metal_ctx->n_threads); metal_plan->cgraph = *cgraph; if (metal_plan->cplan.work_size > 0) { @@ -1494,27 +1496,20 @@ static void ggml_backend_metal_graph_plan_free(ggml_backend_t backend, ggml_back } static void ggml_backend_metal_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - struct ggml_backend_plan_metal * metal_plan = (struct ggml_backend_plan_metal *)plan; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; + struct ggml_backend_plan_metal * metal_plan = (struct ggml_backend_plan_metal *)plan; - ggml_graph_compute(&metal_plan->cgraph, &metal_plan->cplan); + ggml_metal_set_n_cb (metal_ctx->ctx, metal_ctx->n_threads); + ggml_metal_graph_compute(metal_ctx->ctx, &metal_plan->cgraph); UNUSED(backend); } static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; - - struct ggml_cplan cplan = ggml_graph_plan(cgraph, metal_ctx->n_threads); - - if (metal_ctx->work_size < cplan.work_size) { - // TODO: may be faster to free and use malloc to avoid the copy - metal_ctx->work_data = realloc(metal_ctx->work_data, cplan.work_size); - metal_ctx->work_size = cplan.work_size; - } - - cplan.work_data = metal_ctx->work_data; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; - ggml_graph_compute(cgraph, &cplan); + ggml_metal_set_n_cb (metal_ctx->ctx, metal_ctx->n_threads); + ggml_metal_graph_compute(metal_ctx->ctx, cgraph); } static bool 
ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -1541,7 +1536,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct }; ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context)); ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); @@ -1552,9 +1547,3 @@ ggml_backend_t ggml_backend_metal_init(void) { return metal_backend; } - -void ggml_backend_metal_set_n_threads(ggml_backend_t backend_metal, int n_threads) { - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend_metal->context; - - ggml_metal_set_n_cb(ctx, n_threads); -} From 77696b4955a0c866b044234b7e25372edcbdba8b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Oct 2023 16:46:35 +0300 Subject: [PATCH 3/5] ggml-backend : working metal --- examples/gpt-2/main.cpp | 31 ++++++++++++++----------------- src/ggml-metal.m | 12 ++++++++++-- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index e040ccbae0..da20540911 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -535,9 +535,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] cur = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), - cur), - //ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + cur, + model.layers[il].ln_1_g), model.layers[il].ln_1_b); } @@ -555,8 +554,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), - cur); + cur, + model.layers[il].c_attn_attn_b); } // self-attention @@ -663,8 +662,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), - cur); + cur, + model.layers[il].c_attn_proj_b); } // add the input @@ -682,9 +681,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] cur = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), - cur), - //ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + cur, + model.layers[il].ln_2_g), model.layers[il].ln_2_b); } @@ -701,8 +699,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), - cur); + cur, + model.layers[il].c_mlp_fc_b); // GELU activation // [3072, N] @@ -721,8 +719,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), - cur); + cur, + model.layers[il].c_mlp_proj_b); } // input for next layer @@ -738,9 +736,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] inpL = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - //ggml_repeat(ctx0, model.ln_f_b, inpL)); + inpL, + model.ln_f_g), model.ln_f_b); } diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 327fa86176..7bb663aea5 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -151,8 +151,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ } } - - struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); @@ -1419,8 +1417,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) }; static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { + struct ggml_backend_metal_context * metal_ctx = (struct 
ggml_backend_metal_context *)backend->context; + void * data = ggml_metal_host_malloc(size); + ggml_metal_add_buffer(metal_ctx->ctx, "backend", data, size, 0); + return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); } @@ -1538,6 +1540,12 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_backend_t ggml_backend_metal_init(void) { struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context)); + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + + ctx->ctx = ggml_metal_init(ctx->n_threads); + ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { From e934ac2b7675b46070bb3154c19004c453daa22e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:39:54 +0300 Subject: [PATCH 4/5] ggml-backend : clean-up metal implementation --- examples/gpt-2/main.cpp | 10 +++++ src/CMakeLists.txt | 2 +- src/ggml-metal.h | 37 +++++++++++------ src/ggml-metal.m | 89 +++++++++-------------------------------- 4 files changed, 53 insertions(+), 85 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index da20540911..8fb32ee865 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -26,6 +26,13 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + // default hparams (GPT-2 117M) struct gpt2_hparams { int32_t n_vocab = 50257; @@ -241,6 +248,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & #ifdef GGML_USE_METAL if (n_gpu_layers > 0) { fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); model.backend = ggml_backend_metal_init(); if (!model.backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); @@ -789,6 +797,8 @@ bool gpt2_eval( // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } else if (strcmp(ggml_backend_name(model.backend), "Metal") == 0) { + ggml_backend_metal_set_n_threads(model.backend, n_threads); } ggml_backend_graph_compute(model.backend, gf); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bcfb4b23bf..b225597eda 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -229,7 +229,7 @@ if (GGML_METAL) set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) - add_compile_definitions(GGML_METAL_NDEBUG) + #add_compile_definitions(GGML_METAL_NDEBUG) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/src/ggml-metal.h b/src/ggml-metal.h index 250a5ef81a..bc6773a6e6 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -36,19 +36,24 @@ struct ggml_cgraph; extern "C" { #endif -GGML_API void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); +// +// internal API +// temporary exposed to user-code +// struct ggml_metal_context; +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); + // number of command buffers to use -GGML_API struct ggml_metal_context * ggml_metal_init(int n_cb); -GGML_API void ggml_metal_free(struct ggml_metal_context * ctx); +struct ggml_metal_context * ggml_metal_init(int n_cb); +void 
ggml_metal_free(struct ggml_metal_context * ctx); -GGML_API void * ggml_metal_host_malloc(size_t n); -GGML_API void ggml_metal_host_free (void * data); +void * ggml_metal_host_malloc(size_t n); +void ggml_metal_host_free (void * data); // set the number of command buffers to use -GGML_API void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute @@ -57,7 +62,7 @@ GGML_API void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); // - max_size specifies the maximum size of a tensor and is used to create shared views such // that it is guaranteed that the tensor will fit in at least one of the views // -GGML_API bool ggml_metal_add_buffer( +bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, const char * name, void * data, @@ -65,28 +70,34 @@ GGML_API bool ggml_metal_add_buffer( size_t max_size); // set data from host memory into the device -GGML_API void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); +void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); // get data from the device into host memory -GGML_API void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); +void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); // try to find operations that can be run concurrently in the graph // you should run it again if the topology of your graph changes -GGML_API void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); +void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized -GGML_API int ggml_metal_if_optimized(struct ggml_metal_context * ctx); +int ggml_metal_if_optimized(struct ggml_metal_context * ctx); // output the concur_list for ggml_alloc -GGML_API int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); +int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // same as ggml_graph_compute but uses Metal // creates gf->n_threads command buffers in parallel -GGML_API void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +// // backend API +// user-code should use only these functions +// + GGML_API ggml_backend_t ggml_backend_metal_init(void); +GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads); + #ifdef __cplusplus } #endif diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 7bb663aea5..a06b738a19 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1374,17 +1374,6 @@ void ggml_metal_graph_compute( // backend interface -#define UNUSED GGML_UNUSED - -struct ggml_backend_metal_context { - int n_threads; - - void * work_data; - size_t work_size; - - struct ggml_metal_context * ctx; -}; - static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; @@ -1392,10 +1381,8 @@ void ggml_metal_graph_compute( } static void ggml_backend_metal_free(ggml_backend_t backend) { - struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; - 
ggml_metal_free(metal_ctx->ctx); - free(metal_ctx->work_data); - free(metal_ctx); + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + ggml_metal_free(ctx); free(backend); } @@ -1417,18 +1404,17 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) }; static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { - struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; void * data = ggml_metal_host_malloc(size); - ggml_metal_add_buffer(metal_ctx->ctx, "backend", data, size, 0); + // TODO: set proper name of the buffers + ggml_metal_add_buffer(ctx, "backend", data, size, 0); return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); } static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { - // TODO: get page size ?? - //return sysconf(_SC_PAGESIZE); return 32; UNUSED(backend); } @@ -1462,56 +1448,15 @@ static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct gg } static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { - // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); UNUSED(backend); } -struct ggml_backend_plan_metal { - struct ggml_cplan cplan; - struct ggml_cgraph cgraph; -}; - -static ggml_backend_graph_plan_t ggml_backend_metal_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; - - struct ggml_backend_plan_metal * metal_plan = malloc(sizeof(struct ggml_backend_plan_metal)); - - metal_plan->cplan = ggml_graph_plan(cgraph, metal_ctx->n_threads); - metal_plan->cgraph = *cgraph; - - if (metal_plan->cplan.work_size > 0) { - metal_plan->cplan.work_data = malloc(metal_plan->cplan.work_size); - } - - return metal_plan; -} - -static void ggml_backend_metal_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - struct ggml_backend_plan_metal * metal_plan = (struct ggml_backend_plan_metal *)plan; - - free(metal_plan->cplan.work_data); - free(metal_plan); - - UNUSED(backend); -} - -static void ggml_backend_metal_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; - struct ggml_backend_plan_metal * metal_plan = (struct ggml_backend_plan_metal *)plan; - - ggml_metal_set_n_cb (metal_ctx->ctx, metal_ctx->n_threads); - ggml_metal_graph_compute(metal_ctx->ctx, &metal_plan->cgraph); - - UNUSED(backend); -} - static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; - ggml_metal_set_n_cb (metal_ctx->ctx, metal_ctx->n_threads); - ggml_metal_graph_compute(metal_ctx->ctx, cgraph); + ggml_metal_graph_compute(metal_ctx, cgraph); } static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -1530,21 +1475,17 @@ static bool 
ggml_backend_metal_supports_op(ggml_backend_t backend, const struct /* .synchronize = */ ggml_backend_metal_synchronize, /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, - /* .graph_plan_create = */ ggml_backend_metal_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_metal_graph_plan_free, - /* .graph_plan_compute = */ ggml_backend_metal_graph_plan_compute, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, }; ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context)); - - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); - ctx->ctx = ggml_metal_init(ctx->n_threads); + ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); @@ -1555,3 +1496,9 @@ ggml_backend_t ggml_backend_metal_init(void) { return metal_backend; } + +void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_set_n_cb(ctx, n_threads); +} From 9415943b2891a6430ae8558aab860c8162edf422 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:51:24 +0300 Subject: [PATCH 5/5] ggml-backend : add ggml_backend_is_metal() --- examples/gpt-2/main.cpp | 2 +- include/ggml/ggml-backend.h | 7 +++++-- src/ggml-backend.c | 16 ++++++++++++---- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 8fb32ee865..25725a1d1c 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -797,7 +797,7 @@ bool gpt2_eval( // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } else if (strcmp(ggml_backend_name(model.backend), "Metal") == 0) { + } else if (ggml_backend_is_metal(model.backend)) { ggml_backend_metal_set_n_threads(model.backend, n_threads); } ggml_backend_graph_compute(model.backend, gf); diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 606ea5e4d1..36457e9910 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,14 +132,17 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); /////////////////////////// + // TODO: we should probably do something better here + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API bool ggml_backend_is_cuda (ggml_backend_t backend); + GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); + #if 0 // graph splitting #define GGML_MAX_SPLITS 200 diff --git a/src/ggml-backend.c b/src/ggml-backend.c index c5bc032809..187a149c4f 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -369,10 +369,6 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } -bool ggml_backend_is_cpu(ggml_backend_t backend) { - return 
backend->interface.get_name == ggml_backend_cpu_name; -} - void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -385,6 +381,18 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_cpu_name; +} + +bool ggml_backend_is_cuda(ggml_backend_t backend) { + return strcmp(ggml_backend_name(backend), "CUDA") == 0; +} + +bool ggml_backend_is_metal(ggml_backend_t backend) { + return strcmp(ggml_backend_name(backend), "Metal") == 0; +} + #if 0 // splits
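
Usage sketch (illustrative only, not part of the patch series): taken together, the five patches converge on a small user-facing API for the Metal backend: ggml_backend_metal_init() to create it, ggml_backend_is_metal() to detect it, and ggml_backend_metal_set_n_threads() to forward the thread count to ggml_metal_set_n_cb(). The C sketch below mirrors the gpt-2 example as it stands after PATCH 5/5; the helper names init_backend(), compute_graph() and log_cb() are hypothetical, everything else comes from the diffs above.

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"
    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    // minimal log callback, analogous to ggml_log_callback_default in the gpt-2 example
    static void log_cb(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
        fflush(stderr);
    }

    // pick the Metal backend when GPU offload is requested, fall back to CPU otherwise
    static ggml_backend_t init_backend(int n_gpu_layers) {
        ggml_backend_t backend = NULL;
    #ifdef GGML_USE_METAL
        if (n_gpu_layers > 0) {
            ggml_metal_log_set_callback(log_cb, NULL);
            backend = ggml_backend_metal_init();
        }
    #endif
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }
        return backend;
    }

    // set the thread count in a backend-specific way, then run the graph
    static void compute_graph(ggml_backend_t backend, struct ggml_cgraph * gf, int n_threads) {
        if (ggml_backend_is_cpu(backend)) {
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        } else if (ggml_backend_is_metal(backend)) {
            // kept symmetric with the CPU call; internally this forwards to
            // ggml_metal_set_n_cb(), i.e. the number of Metal command buffers
            ggml_backend_metal_set_n_threads(backend, n_threads);
        }
        ggml_backend_graph_compute(backend, gf);
    }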