5 changes: 5 additions & 0 deletions examples/gpt-2/CMakeLists.txt
@@ -18,6 +18,11 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
if (GGML_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)
endif()

if (GGML_CLBLAST)
add_compile_definitions(GGML_USE_CLBLAST)
endif()

if (GGML_METAL)
add_compile_definitions(GGML_USE_METAL)
endif()
55 changes: 38 additions & 17 deletions examples/gpt-2/main.cpp
@@ -6,6 +6,10 @@
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include "common.h"
#include "common-ggml.h"

@@ -22,6 +26,13 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
}

// default hparams (GPT-2 117M)
struct gpt2_hparams {
int32_t n_vocab = 50257;
@@ -234,6 +245,17 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}
#endif

#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
}
#endif

if (!model.backend) {
// fallback to CPU backend
fprintf(stderr, "%s: using CPU backend\n", __func__);
@@ -521,9 +543,8 @@ struct ggml_cgraph * gpt2_graph(
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
cur),
//ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
cur,
model.layers[il].ln_1_g),
model.layers[il].ln_1_b);
}

@@ -541,8 +562,8 @@
cur);

cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
cur);
cur,
model.layers[il].c_attn_attn_b);
}

// self-attention
@@ -649,8 +670,8 @@
cur);

cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
cur);
cur,
model.layers[il].c_attn_proj_b);
}

// add the input
@@ -668,9 +689,8 @@
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
cur),
//ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
cur,
model.layers[il].ln_2_g),
model.layers[il].ln_2_b);
}

@@ -687,8 +707,8 @@
cur);

cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
cur);
cur,
model.layers[il].c_mlp_fc_b);

// GELU activation
// [3072, N]
Expand All @@ -707,8 +727,8 @@ struct ggml_cgraph * gpt2_graph(
cur);

cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
cur);
cur,
model.layers[il].c_mlp_proj_b);
}

// input for next layer
@@ -724,9 +744,8 @@
// [ 768, N]
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.ln_f_g, inpL),
inpL),
//ggml_repeat(ctx0, model.ln_f_b, inpL));
inpL,
model.ln_f_g),
model.ln_f_b);
}

@@ -778,6 +797,8 @@ bool gpt2_eval(
// run the computation
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
} else if (ggml_backend_is_metal(model.backend)) {
ggml_backend_metal_set_n_threads(model.backend, n_threads);
}
ggml_backend_graph_compute(model.backend, gf);

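Most of the gpt2_graph changes above follow a single pattern: the explicit ggml_repeat of a bias or layer-norm weight is dropped, and the 1-d tensor is passed directly to ggml_add / ggml_mul, which broadcast it across the columns of the activations. A condensed before/after sketch of that pattern, using tensor names from this diff and assuming the broadcasting behaviour of ggml_add and ggml_mul that the updated graph relies on:

// before: the bias was first repeated to the [n_embd, N] shape of cur
cur = ggml_add(ctx0,
        ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
        cur);

// after: ggml_add broadcasts the bias across the N columns of cur
cur = ggml_add(ctx0,
        cur,
        model.layers[il].c_attn_proj_b);

The same rewrite is applied to the layer-norm scale and shift tensors (ln_1_g/ln_1_b, ln_2_g/ln_2_b, ln_f_g/ln_f_b) and to the remaining projection biases.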
7 changes: 5 additions & 2 deletions include/ggml/ggml-backend.h
@@ -132,14 +132,17 @@ extern "C" {

GGML_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);

GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

///////////////////////////

// TODO: we should probably do something better here
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API bool ggml_backend_is_cuda (ggml_backend_t backend);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

#if 0
// graph splitting
#define GGML_MAX_SPLITS 200
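The gpt-2 example above has to dispatch on the backend type before it can set a thread count. A hypothetical helper, not part of this diff, built only on the declarations here plus ggml_backend_metal_set_n_threads from ggml-metal.h, could fold that dispatch into one call (in a real build the Metal branch would sit behind GGML_USE_METAL, as in main.cpp):

#include "ggml-backend.h"
#include "ggml-metal.h"   // for ggml_backend_metal_set_n_threads

// hypothetical convenience wrapper around the per-backend thread setters
static void set_backend_n_threads(ggml_backend_t backend, int n_threads) {
    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    } else if (ggml_backend_is_metal(backend)) {
        ggml_backend_metal_set_n_threads(backend, n_threads);
    }
    // no thread-count setter is exposed for the CUDA backend in this diff
}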
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
@@ -229,7 +229,7 @@ if (GGML_METAL)
set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)
#add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
16 changes: 12 additions & 4 deletions src/ggml-backend.c
@@ -369,10 +369,6 @@ ggml_backend_t ggml_backend_cpu_init(void) {
return cpu_backend;
}

bool ggml_backend_is_cpu(ggml_backend_t backend) {
return backend->interface.get_name == ggml_backend_cpu_name;
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

@@ -385,6 +381,18 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size);
}

bool ggml_backend_is_cpu(ggml_backend_t backend) {
return backend->interface.get_name == ggml_backend_cpu_name;
}

bool ggml_backend_is_cuda(ggml_backend_t backend) {
return strcmp(ggml_backend_name(backend), "CUDA") == 0;
}

bool ggml_backend_is_metal(ggml_backend_t backend) {
return strcmp(ggml_backend_name(backend), "Metal") == 0;
}

#if 0
// splits

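Note the asymmetry between the three checks: ggml_backend_is_cpu compares the backend's get_name function pointer, while the CUDA and Metal checks compare the string returned by ggml_backend_name. The string form generalizes to any named backend; a hypothetical sketch of that generic check, mirroring the two functions above:

#include <stdbool.h>
#include <string.h>
#include "ggml-backend.h"

// hypothetical helper, same shape as ggml_backend_is_cuda / ggml_backend_is_metal
static bool ggml_backend_has_name(ggml_backend_t backend, const char * name) {
    return strcmp(ggml_backend_name(backend), name) == 0;
}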
1 change: 0 additions & 1 deletion src/ggml-cuda.h
@@ -46,7 +46,6 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description,
// backend API
GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use


#ifdef __cplusplus
}
#endif
17 changes: 16 additions & 1 deletion src/ggml-metal.h
@@ -20,6 +20,7 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stddef.h>
#include <stdbool.h>
@@ -35,10 +36,15 @@ struct ggml_cgraph;
extern "C" {
#endif

void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
//
// internal API
// temporary exposed to user-code
//

struct ggml_metal_context;

void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);

// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -83,6 +89,15 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

//
// backend API
// user-code should use only these functions
//

GGML_API ggml_backend_t ggml_backend_metal_init(void);

GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads);

#ifdef __cplusplus
}
#endif
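The reorganized header now separates the context-based internal API, which is only temporarily exposed to user code, from the backend API that user code is expected to call. For contrast with the backend path used in the gpt-2 example, a rough sketch of the internal path that ggml_backend_metal_init wraps, using only functions declared in this header or called in ggml-metal.m below; the buffer size, thread count, and graph are placeholders:

// assume: size_t data_size, int n_threads, and struct ggml_cgraph * gf exist elsewhere
struct ggml_metal_context * ctx = ggml_metal_init(1); // 1 command buffer

// tensor data must live in a buffer registered with the Metal context
void * data = ggml_metal_host_malloc(data_size);
ggml_metal_add_buffer(ctx, "data", data, data_size, 0); // same call shape as in ggml_backend_metal_alloc_buffer

ggml_metal_set_n_cb(ctx, n_threads);
ggml_metal_graph_compute(ctx, gf);

ggml_metal_free(ctx);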
135 changes: 133 additions & 2 deletions src/ggml-metal.m
@@ -151,8 +151,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
}
}



struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);

@@ -1371,3 +1369,136 @@ void ggml_metal_graph_compute(

}
}

////////////////////////////////////////////////////////////////////////////////

// backend interface

static const char * ggml_backend_metal_name(ggml_backend_t backend) {
return "Metal";

UNUSED(backend);
}

static void ggml_backend_metal_free(ggml_backend_t backend) {
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
ggml_metal_free(ctx);
free(backend);
}

static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)buffer->context;
}

static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
UNUSED(buffer);
}

static struct ggml_backend_buffer_i metal_backend_buffer_i = {
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_get_base,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .init_tensor = */ NULL, // no initialization required
/* .free_tensor = */ NULL, // no cleanup required
};

static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;

void * data = ggml_metal_host_malloc(size);

// TODO: set proper name of the buffers
ggml_metal_add_buffer(ctx, "backend", data, size, 0);

return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
}

static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
return 32;
UNUSED(backend);
}

static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

memcpy((char *)tensor->data + offset, data, size);

UNUSED(backend);
}

static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

memcpy(data, (const char *)tensor->data + offset, size);

UNUSED(backend);
}

static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
UNUSED(backend);
}

static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));

UNUSED(backend);
}

static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));

UNUSED(backend);
}

static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;

ggml_metal_graph_compute(metal_ctx, cgraph);
}

static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
return true;
UNUSED(backend);
UNUSED(op);
}

static struct ggml_backend_i metal_backend_i = {
/* .get_name = */ ggml_backend_metal_name,
/* .free = */ ggml_backend_metal_free,
/* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_get_alignment,
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
/* .synchronize = */ ggml_backend_metal_synchronize,
/* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to,
/* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
/* .supports_op = */ ggml_backend_metal_supports_op,
};

ggml_backend_t ggml_backend_metal_init(void) {
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));

ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);

ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));

*metal_backend = (struct ggml_backend) {
/* .interface = */ metal_backend_i,
/* .context = */ ctx,
};

return metal_backend;
}

void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) {
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;

ggml_metal_set_n_cb(ctx, n_threads);
}
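In ggml_backend_metal_init above, the context allocated with malloc is immediately overwritten by the pointer returned from ggml_metal_init, so that first allocation is never used or freed. A leaner sketch of the same constructor, not part of this PR and assuming ggml_metal_init returns NULL on failure, which would also let the NULL check in the gpt-2 example trigger:

ggml_backend_t ggml_backend_metal_init(void) {
    // ggml_metal_init allocates and returns the Metal context itself
    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
    if (ctx == NULL) {
        return NULL;
    }

    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));

    *metal_backend = (struct ggml_backend) {
        /* .interface = */ metal_backend_i,
        /* .context   = */ ctx,
    };

    return metal_backend;
}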