@@ -3846,6 +3846,40 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

+ // WARN:
+ // Misconfiguration can lead to problems that are hard to reason about:
+ // * At best it crashes or talks nonsense.
+ // * At worst it talks slightly differently, which is hard to perceive.
+ //
+ // An op has to enable INIT or FINALIZE when any of its branches needs that pass.
+ // Take care with compile options (e.g., GGML_USE_xxx).
+ static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 };
+ static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
+ static void ggml_setup_op_has_task_pass(void) {
+     { // INIT
+         bool * I = GGML_OP_HAS_INIT;
+
+         I[GGML_OP_ACC               ] = true;
+         I[GGML_OP_MUL_MAT           ] = true;
+         I[GGML_OP_OUT_PROD          ] = true;
+         I[GGML_OP_SET               ] = true;
+         I[GGML_OP_GET_ROWS_BACK     ] = true;
+         I[GGML_OP_DIAG_MASK_INF     ] = true;
+         I[GGML_OP_DIAG_MASK_ZERO    ] = true;
+         I[GGML_OP_CONV_1D_S1_PH    ] = true;
+         I[GGML_OP_CONV_1D_S2_PH    ] = true;
+         I[GGML_OP_CONV_2D_SK_P0    ] = true;
+         I[GGML_OP_FLASH_ATTN_BACK  ] = true;
+         I[GGML_OP_CROSS_ENTROPY_LOSS] = true;
+     }
+
+     { // FINALIZE
+         bool * F = GGML_OP_HAS_FINALIZE;
+
+         F[GGML_OP_CROSS_ENTROPY_LOSS] = true;
+     }
+ }
+

 //
 // ggml context
 //
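
The hunk above adds two per-op lookup tables that are filled once at startup and later consulted by the compute thread to skip the INIT and FINALIZE passes for ops that never registered them. Below is a minimal, self-contained sketch of that register-then-gate pattern; the toy_* enum, the table names and the run_node() driver are hypothetical illustrations, not ggml code.

    // Sketch: per-op flags filled once, then used to skip unneeded passes.
    #include <stdbool.h>
    #include <stdio.h>

    enum toy_op   { TOY_OP_ADD, TOY_OP_MUL_MAT, TOY_OP_COUNT };
    enum toy_task { TOY_TASK_INIT, TOY_TASK_COMPUTE, TOY_TASK_FINALIZE };

    static bool TOY_OP_HAS_INIT    [TOY_OP_COUNT] = { 0 };
    static bool TOY_OP_HAS_FINALIZE[TOY_OP_COUNT] = { 0 };

    // Mirrors ggml_setup_op_has_task_pass(): every op whose kernel needs the
    // INIT or FINALIZE pass in any branch or build configuration must be listed.
    static void toy_setup_op_has_task_pass(void) {
        TOY_OP_HAS_INIT[TOY_OP_MUL_MAT] = true;   // e.g. zero-fills scratch memory in INIT
    }

    static void toy_forward(enum toy_task task, enum toy_op op) {
        printf("op=%d task=%d\n", (int) op, (int) task);
    }

    // Toy driver: COMPUTE always runs; INIT/FINALIZE only when registered.
    static void run_node(enum toy_op op) {
        if (TOY_OP_HAS_INIT[op])     { toy_forward(TOY_TASK_INIT, op); }
        toy_forward(TOY_TASK_COMPUTE, op);
        if (TOY_OP_HAS_FINALIZE[op]) { toy_forward(TOY_TASK_FINALIZE, op); }
    }

    int main(void) {
        toy_setup_op_has_task_pass();
        run_node(TOY_OP_ADD);      // only COMPUTE
        run_node(TOY_OP_MUL_MAT);  // INIT + COMPUTE
        return 0;
    }

The payoff of this design is that the scheduler no longer dispatches two extra passes per node when most ops only implement COMPUTE, at the cost of having to keep the tables in sync with the kernels.
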
@@ -4267,6 +4301,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     ggml_cl_init();
 #endif

+    ggml_setup_op_has_task_pass();
+
     is_first_call = false;
 }

@@ -16791,9 +16827,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     if (node_n != -1) {
         /* FINALIZE */
         struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
-        params.nth = node->n_tasks;
-        ggml_compute_forward(&params, node);
-        ggml_graph_compute_perf_stats_node(node, state->shared);
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            params.nth = node->n_tasks;
+            ggml_compute_forward(&params, node);
+            ggml_graph_compute_perf_stats_node(node, state->shared);
+        }
     }

// distribute new work or execute it direct if 1T
@@ -16805,20 +16843,25 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         state->shared->perf_node_start_cycles  = ggml_perf_cycles();
         state->shared->perf_node_start_time_us = ggml_perf_time_us();

+        params.nth = node->n_tasks;
+
         /* INIT */
-        params.type = GGML_TASK_INIT;
-        params.nth = node->n_tasks;
-        ggml_compute_forward(&params, node);
+        if (GGML_OP_HAS_INIT[node->op]) {
+            params.type = GGML_TASK_INIT;
+            ggml_compute_forward(&params, node);
+        }

         if (node->n_tasks == 1) {
             // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
             // they do something more efficient than spinning (?)
             params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);

-            params.type = GGML_TASK_FINALIZE;
-            ggml_compute_forward(&params, node);
-            ggml_graph_compute_perf_stats_node(node, state->shared);
+            if (GGML_OP_HAS_FINALIZE[node->op]) {
+                params.type = GGML_TASK_FINALIZE;
+                ggml_compute_forward(&params, node);
+                ggml_graph_compute_perf_stats_node(node, state->shared);
+            }
         } else {
             break;
         }
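
The WARN comment in the first hunk is the crux of these compute-thread changes: if a kernel depends on work done in the INIT pass but its op is not registered in GGML_OP_HAS_INIT, the COMPUTE pass silently runs against stale data. The following toy illustration of that failure mode is a hypothetical sketch, not ggml code: kernel_init() stands in for any INIT-time setup such as zeroing a scratch buffer.

    // Sketch: a kernel that relies on INIT zeroing its accumulator.
    // Skipping INIT does not crash; it just produces quietly wrong numbers.
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static float acc[4] = { 99.f, 99.f, 99.f, 99.f };   // scratch memory reused across nodes

    static void kernel_init(void)    { memset(acc, 0, sizeof(acc)); }
    static void kernel_compute(void) { for (int i = 0; i < 4; i++) acc[i] += 1.f; }

    static void run(bool has_init) {
        if (has_init) { kernel_init(); }   // skipped when the op was never registered for INIT
        kernel_compute();
        printf("acc[0] = %.1f (%s)\n", acc[0], has_init ? "correct" : "stale, hard to notice");
    }

    int main(void) {
        run(true);    // prints 1.0
        run(false);   // prints 2.0: wrong, but no crash
        return 0;
    }
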