@@ -737,21 +737,32 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // graph view of this split
     struct ggml_cgraph graph;
 };
 
+// TODO: group all the hash values into a single struct for clarity
+//struct sched_hash_value {
+//    ggml_tallocr_t tallocr;
+//    struct ggml_tensor * copies[GGML_MAX_BACKENDS];
+//};
+
 struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
     ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
 
     ggml_gallocr_t galloc;
 
+    // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    ggml_tallocr_t * node_talloc;                            // [hash_set.size]
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+    // hash values (arrays of [hash_set.size])
+    ggml_tallocr_t * node_talloc;                            // tallocr assigned to each node (indirectly this is the backend)
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
+    // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
+
     struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
     int n_splits;
 
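The two node_* pointers above are parallel "hash value" arrays: a node is hashed once into hash_set, and the resulting index is reused for both node_talloc and node_copies, which is also why the TODO suggests folding them into a single sched_hash_value struct. A minimal sketch of this keys-plus-parallel-values pattern, assuming linear probing on the pointer value (an illustration only, not the actual ggml_hash_set implementation):

    #include <stdio.h>

    #define HASH_SIZE 64 /* assumed size; ggml sizes this from the graph */

    struct hash_set {
        const void * keys[HASH_SIZE];
    };

    // find the slot of a key, inserting it if it is not present
    static size_t hash_find_or_insert(struct hash_set * set, const void * key) {
        size_t i = (size_t)key % HASH_SIZE;
        while (set->keys[i] != NULL && set->keys[i] != key) {
            i = (i + 1) % HASH_SIZE; // linear probing
        }
        set->keys[i] = key;
        return i;
    }

    int main(void) {
        struct hash_set set = {0};
        int node_talloc[HASH_SIZE] = {0}; // parallel value array

        int node_a = 0, node_b = 0;       // stand-ins for graph nodes
        node_talloc[hash_find_or_insert(&set, &node_a)] = 1;
        node_talloc[hash_find_or_insert(&set, &node_b)] = 2;

        printf("node_a -> allocr %d\n", node_talloc[hash_find_or_insert(&set, &node_a)]);
        return 0;
    }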
@@ -928,6 +939,12 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
     return dup;
 }
 
+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 // TODO: merge passes
 static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
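These macros are compiled out by default; uncommenting one of the #define lines above enables the sched_print_assignments dump after the corresponding pass. They replace the ad-hoc commented-out printf calls that the hunks below remove.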
@@ -977,42 +994,110 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
         }
     }
-    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS1
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 2: assign backends to ops from current assignments
     // start from the end and assign the same backend to previous ops
+
+    // expand gpu backends (i.e. non-last-prio) up and down, ignoring cpu
+    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+    // pass 2.1 expand gpu up
     {
         ggml_tallocr_t cur_allocr = NULL;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
             ggml_tallocr_t node_allocr = node_allocr(node);
             if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
             } else {
                 node_allocr(node) = cur_allocr;
                 SET_CAUSE(node, "2.cur");
             }
         }
     }
 
-    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    // pass 2.2 expand gpu down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+
+    // pass 2.3 expand rest up
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+#ifdef DEBUG_PASS2
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
-    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_tallocr_t cur_allocr = node_allocr(node);
+        if (ggml_is_view_op(node->op) && cur_allocr == NULL) {
+            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+            SET_CAUSE(node, "3.vsrc");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 break;
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr == NULL) {
-                node_allocr(src) = node_allocr;
+                if (src->view_src != NULL) {
+                    // views are always on the same backend as the source
+                    node_allocr(src) = node_allocr(src->view_src);
+                } else {
+                    node_allocr(src) = cur_allocr;
+                }
             }
         }
     }
-    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS3
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
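The three expansion passes are plain forward/backward sweeps over the node list: a sweep carries the last seen allocator into unassigned nodes, and the first two sweeps drop the carried value whenever they hit the lowest-priority (CPU) backend, so CPU assignments never spread on their own. A toy model of the sweep logic over bare backend ids (views and the real allocator types are omitted; this is a sketch, not the scheduler's code):

    #include <stdio.h>

    #define N_NODES    8
    #define N_BACKENDS 3
    #define CPU        (N_BACKENDS - 1) // lowest priority backend

    // carry the last seen assignment into unassigned (-1) nodes;
    // dir > 0 sweeps forward ("down"), dir < 0 sweeps backward ("up")
    static void expand(int * a, int n, int dir, int skip_cpu) {
        int cur = -1;
        for (int k = 0; k < n; k++) {
            int i = dir > 0 ? k : n - 1 - k;
            if (a[i] != -1) {
                cur = (skip_cpu && a[i] == CPU) ? -1 : a[i];
            } else {
                a[i] = cur;
            }
        }
    }

    int main(void) {
        // pass-1 result: only nodes with pre-allocated inputs (e.g. weights) are pinned
        int a[N_NODES] = { 2, -1, 0, -1, -1, 1, -1, 2 };

        expand(a, N_NODES, -1, 1); // pass 2.1: expand gpu up
        expand(a, N_NODES, +1, 1); // pass 2.2: expand gpu down
        expand(a, N_NODES, -1, 0); // pass 2.3: expand rest up

        for (int i = 0; i < N_NODES; i++) {
            printf("%d ", a[i]); // prints: 2 0 0 1 1 1 1 2
        }
        printf("\n");
        return 0;
    }

Note how the CPU assignments at the ends stay put while the gaps between them are filled with GPU backends, which is exactly the "cpu will never be used unless..." behavior described above.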
@@ -1074,7 +1159,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
                 }
 
-                // create copies
+                // create a copy of the input in the split's backend
                 size_t id = hash_id(src);
                 if (sched->node_copies[id][cur_backend_id] == NULL) {
                     struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
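The copy bookkeeping is a lazily filled two-dimensional table: one slot per (node hash id, destination backend), created at most once and then reused by every later split on that backend. A sketch of that create-once pattern with illustrative names (the real code fills the slot with ggml_dup_tensor_layout and allocates it in the split's tallocr):

    #include <stdio.h>
    #include <stdlib.h>

    #define HASH_SIZE    1024 /* assumed sizes for the sketch */
    #define MAX_BACKENDS 4

    struct tensor { int backend_id; };

    static struct tensor * node_copies[HASH_SIZE][MAX_BACKENDS];

    // return the copy of node `id` on `backend_id`, creating it on first use
    static struct tensor * get_or_create_copy(size_t id, int backend_id) {
        if (node_copies[id][backend_id] == NULL) {
            struct tensor * copy = calloc(1, sizeof(*copy));
            copy->backend_id = backend_id; // the real code duplicates the tensor layout here
            node_copies[id][backend_id] = copy;
        }
        return node_copies[id][backend_id];
    }

    int main(void) {
        // two lookups for the same (node, backend) yield the same copy
        printf("%d\n", get_or_create_copy(42, 1) == get_or_create_copy(42, 1));
        return 0;
    }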
@@ -1090,8 +1175,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         sched->splits[cur_split].i_end = graph->n_nodes;
         sched->n_splits = cur_split + 1;
     }
-
-    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS4
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
@@ -1101,6 +1187,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         if (node_allocr == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
+        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1112,8 +1203,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
                     j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
             }
+            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+            }
         }
     }
+    fflush(stderr);
 #endif
 
     // create copies of the graph for each split
@@ -1127,6 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            // add a dependency to the input source so that it is not freed before the copy is done
             input_cpy->src[0] = input;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
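The extra src edge matters because the graph allocator decides when a tensor's memory can be reused by how many still-pending consumers it has; without input_cpy->src[0] = input, the copy node would have no visible dependency on input and the allocator could recycle input's buffer before the copy runs. A minimal sketch of that consumer-counting rule (an assumed simplification of what ggml-alloc tracks):

    #include <stdio.h>
    #include <stddef.h>

    #define MAX_SRC 2

    struct node {
        const char * name;
        struct node * src[MAX_SRC];
        int n_consumers; // later nodes that still read this tensor
    };

    // count consumers by walking the src edges, as an allocator would
    static void count_consumers(struct node ** nodes, int n) {
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < MAX_SRC; j++) {
                if (nodes[i]->src[j] != NULL) {
                    nodes[i]->src[j]->n_consumers++;
                }
            }
        }
    }

    int main(void) {
        struct node input     = { "input",     { NULL, NULL },   0 };
        struct node input_cpy = { "input_cpy", { &input, NULL }, 0 };
        struct node * nodes[] = { &input, &input_cpy };

        count_consumers(nodes, 2);
        // with the dependency edge, input stays alive until input_cpy has run
        printf("%s consumers: %d\n", input.name, input.n_consumers);
        return 0;
    }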
@@ -1163,19 +1261,20 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
             if (input->buffer == NULL) {
+                GGML_ASSERT(false);
                 if (input->view_src == NULL) {
                     fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
-                    exit(1);
+                    GGML_ASSERT(false);
                 }
                 // FIXME: may need to use the sched buffer instead
                 ggml_backend_view_init(input->view_src->buffer, input);
             }
             if (input_cpy->buffer == NULL) {
                 fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
-                exit(1);
+                GGML_ASSERT(false);
             }
-            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
-            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
             ggml_backend_tensor_copy(input, input_cpy);
         }
         // ggml_backend_synchronize(split_backend);
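The TODO above is about redundant transfers: constants such as KQ_mask and inp_pos are re-copied for every split that consumes them. One possible shape for the fix, offered here as an assumption rather than what the PR implements, is to version-stamp each input and skip the copy when the destination already holds the current version:

    #include <stdio.h>
    #include <string.h>

    struct input {
        int  version;   // bumped whenever the host-side data changes
        char data[64];
    };

    struct cached_copy {
        int  src_version; // version of the source last copied, 0 = never
        char data[64];
    };

    // copy only when the cached version is stale
    static int copy_if_needed(const struct input * in, struct cached_copy * out) {
        if (out->src_version == in->version) {
            return 0; // up to date: skip, e.g. a constant copied by an earlier split
        }
        memcpy(out->data, in->data, sizeof(out->data));
        out->src_version = in->version;
        return 1;
    }

    int main(void) {
        struct input       kq_mask = { 1, "mask data" };
        struct cached_copy dev     = { 0, "" };

        printf("first split copies:  %d\n", copy_if_needed(&kq_mask, &dev)); // 1
        printf("second split copies: %d\n", copy_if_needed(&kq_mask, &dev)); // 0
        return 0;
    }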
@@ -1301,6 +1400,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 }
 
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized