@@ -768,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
    }

    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -1007,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif

#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 1024
+#define GGML_SCHED_MAX_SPLITS 2048
#endif

#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#define GGML_SCHED_MAX_SPLIT_INPUTS 4
#endif

#ifndef GGML_SCHED_MAX_COPIES
@@ -1422,31 +1426,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now

-        // check if a weight is on a different backend and start a new split if so
-        // by starting a new split, the memory of the previously offloaded weights can be reused
-        bool offload = false;
+        // check if we should start a new split based on the sources of the current node
+        bool need_new_split = false;
        if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    continue;
                }
+                // check if a weight is on a different backend
+                // by starting a new split, the memory of the previously offloaded weights can be reused
                if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                    int src_backend_id = tensor_backend_id(src);
                    if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
-                        offload = true;
+                        need_new_split = true;
+                        break;
+                    }
+                }
+                // check if the split has too many inputs
+                if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                    const size_t id = hash_id(src);
+                    int src_backend_id = sched->tensor_backend_id[id];
+                    if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                        //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                        need_new_split = true;
                        break;
                    }
                }
            }
        }

-        if (node_backend_id != cur_backend_id || offload) {
+        if (node_backend_id != cur_backend_id || need_new_split) {
            split->i_end = i;
            i_split++;
            if (i_split >= sched->splits_capacity) {
                sched->splits_capacity *= 2;
                sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                GGML_ASSERT(sched->splits != NULL);
            }
            GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
            split = &sched->splits[i_split];
@@ -1523,13 +1539,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    // create copies of the graph for each split
    // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
            struct ggml_tensor * input = split->inputs[j];
            const size_t input_id = hash_id(input);
            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
@@ -1546,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        }

        for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
@@ -1630,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                }
                ggml_backend_tensor_copy(input, input_cpy);
            } else {
+                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                } else {
                    ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                }
-
                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
            }
        }
@@ -1709,8 +1727,10 @@ ggml_backend_sched_t ggml_backend_sched_new(
    sched->hash_set          = ggml_hash_set_new(graph_size);
    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
    sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);

    sched->n_backends = n_backends;
@@ -1770,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
}

bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
    ggml_backend_sched_split_graph(sched, measure_graph);

    // TODO: extract this to a separate function