@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     // allocate graph
     if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
-        ggml_backend_sched_synchronize(sched);
+        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
 
     ggml_backend_sched_split_graph(sched, graph);
 
-
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
     for (int i = 0; i < sched->n_backends; i++) {
         ggml_backend_synchronize(sched->backends[i]);
     }
-    // reset the current copy to 0 so that the graphs will be similar during generation
-    // necessary for CUDA graphs
-    sched->cur_copy = 0;
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
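For context, a hedged sketch of the caller-side loop that typically drives the scheduler during generation; build_graph(), set_inputs(), ctx, n_tokens and sched are placeholders assumed here, while the ggml_backend_sched_* calls are the existing public API. The behavioral change above only touches the synchronize step: sched->cur_copy is now reset to 0 only when no graph is currently allocated, instead of on every call.

    // minimal sketch, not part of this commit: build_graph() and set_inputs()
    // are hypothetical helpers, everything else is existing ggml API
    for (int i = 0; i < n_tokens; i++) {
        struct ggml_cgraph * gf = build_graph(ctx, i);      // hypothetical helper
        if (!ggml_backend_sched_alloc_graph(sched, gf)) {   // split the graph and allocate it
            break;
        }
        set_inputs(gf, i);                                  // hypothetical helper: upload inputs
        ggml_backend_sched_graph_compute_async(sched, gf);  // dispatch to the assigned backends
        ggml_backend_sched_synchronize(sched);              // wait for all backends; after this commit,
                                                            // cur_copy is reset to 0 here only when
                                                            // no graph is allocated
        ggml_backend_sched_reset(sched);                    // release the allocation for the next graph
    }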