@@ -1312,6 +1312,40 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            auto devices = string_split<std::string>(value, ',');
+            if (devices.empty()) {
+                throw std::invalid_argument("no devices specified");
+            }
+            for (const auto & device : devices) {
+                auto * dev = ggml_backend_dev_by_name(device.c_str());
+                if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+                }
+                params.devices.push_back(dev);
+            }
+            params.devices.push_back(nullptr); // NULL terminator marks the end of the device list
+        }
+    ).set_env("LLAMA_ARG_DEVICES"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("%s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1336,10 +1370,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
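For reference, the split-mode mapping that remains after this removal is a plain string-to-enum chain, now identical on every backend including SYCL. A standalone sketch with a stand-in enum (the real code assigns llama_split_mode values to params.split_mode):

#include <stdexcept>
#include <string>

enum split_mode_t { SPLIT_NONE, SPLIT_LAYER, SPLIT_ROW }; // stand-in for llama_split_mode

static split_mode_t parse_split_mode(const std::string & arg_next) {
    if (arg_next == "none")  { return SPLIT_NONE;  }
    if (arg_next == "layer") { return SPLIT_LAYER; }
    if (arg_next == "row")   { return SPLIT_ROW;   } // no longer rejected when built with SYCL
    throw std::invalid_argument("invalid value");
}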
@@ -2042,6 +2072,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.n_ctx = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            auto devices = string_split<std::string>(value, ',');
+            if (devices.empty()) {
+                throw std::invalid_argument("no devices specified");
+            }
+            for (const auto & device : devices) {
+                auto * dev = ggml_backend_dev_by_name(device.c_str());
+                if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+                }
+                params.speculative.devices.push_back(dev);
+            }
+            params.speculative.devices.push_back(nullptr);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",