@@ -3846,6 +3846,40 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
38463846static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
38473847static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
38483848
3849+ // WARN:
3850+    // Misconfiguration can lead to problems that are hard to reason about:
3851+    // * At best it crashes or talks nonsense.
3852+    // * At worst it produces subtly different results that are hard to perceive.
3853+ //
3854+    // An op has to enable INIT or FINALIZE when any of its branches needs that pass.
3855+ // Take care about compile options (e.g., GGML_USE_xxx).
3856+ static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
3857+ static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
3858+ static void ggml_setup_op_has_task_pass(void) {
3859+ { // INIT
3860+ bool * I = GGML_OP_HAS_INIT;
3861+
3862+ I[GGML_OP_ACC ] = true;
3863+ I[GGML_OP_MUL_MAT ] = true;
3864+ I[GGML_OP_OUT_PROD ] = true;
3865+ I[GGML_OP_SET ] = true;
3866+ I[GGML_OP_GET_ROWS_BACK ] = true;
3867+ I[GGML_OP_DIAG_MASK_INF ] = true;
3868+ I[GGML_OP_DIAG_MASK_ZERO ] = true;
3869+ I[GGML_OP_CONV_1D_S1_PH ] = true;
3870+ I[GGML_OP_CONV_1D_S2_PH ] = true;
3871+ I[GGML_OP_CONV_2D_SK_P0 ] = true;
3872+ I[GGML_OP_FLASH_ATTN_BACK ] = true;
3873+ I[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
3874+ }
3875+
3876+ { // FINALIZE
3877+ bool * F = GGML_OP_HAS_FINALIZE;
3878+
3879+ F[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
3880+ }
3881+ }
3882+
38493883//
38503884// ggml context
38513885//
@@ -4267,6 +4301,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
42674301 ggml_cl_init();
42684302#endif
42694303
4304+ ggml_setup_op_has_task_pass();
4305+
42704306 is_first_call = false;
42714307 }
42724308
@@ -16791,9 +16827,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1679116827 if (node_n != -1) {
1679216828 /* FINALIZE */
1679316829 struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16794- params.nth = node->n_tasks;
16795- ggml_compute_forward(¶ms, node);
16796- ggml_graph_compute_perf_stats_node(node, state->shared);
16830+ if (GGML_OP_HAS_FINALIZE[node->op]) {
16831+ params.nth = node->n_tasks;
16832+ ggml_compute_forward(¶ms, node);
16833+ ggml_graph_compute_perf_stats_node(node, state->shared);
16834+ }
1679716835 }
1679816836
1679916837 // distribute new work or execute it direct if 1T
@@ -16805,20 +16843,25 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1680516843 state->shared->perf_node_start_cycles = ggml_perf_cycles();
1680616844 state->shared->perf_node_start_time_us = ggml_perf_time_us();
1680716845
16846+ params.nth = node->n_tasks;
16847+
1680816848 /* INIT */
16809- params.type = GGML_TASK_INIT;
16810- params.nth = node->n_tasks;
16811- ggml_compute_forward(¶ms, node);
16849+ if (GGML_OP_HAS_INIT[node->op]) {
16850+ params.type = GGML_TASK_INIT;
16851+ ggml_compute_forward(¶ms, node);
16852+ }
1681216853
1681316854 if (node->n_tasks == 1) {
1681416855 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
1681516856 // they do something more efficient than spinning (?)
1681616857 params.type = GGML_TASK_COMPUTE;
1681716858 ggml_compute_forward(¶ms, node);
1681816859
16819- params.type = GGML_TASK_FINALIZE;
16820- ggml_compute_forward(¶ms, node);
16821- ggml_graph_compute_perf_stats_node(node, state->shared);
16860+ if (GGML_OP_HAS_FINALIZE[node->op]) {
16861+ params.type = GGML_TASK_FINALIZE;
16862+ ggml_compute_forward(¶ms, node);
16863+ ggml_graph_compute_perf_stats_node(node, state->shared);
16864+ }
1682216865 } else {
1682316866 break;
1682416867 }
0 commit comments