Skip to content

Commit a1519d2

Browse files
authored
Merge pull request #11687 from wenduwan/accelerator_awareness
opal/mca/accelerator: introduce get_device_pci_attr api
2 parents 2a52280 + 00a567e commit a1519d2

File tree

4 files changed

+112
-3
lines changed

4 files changed

+112
-3
lines changed

opal/mca/accelerator/accelerator.h

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2014-2021 Intel, Inc. All rights reserved.
33
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
5+
* Copyright (c) Amazon.com, Inc. or its affiliates.
66
* All Rights reserved.
77
* $COPYRIGHT$
88
*
@@ -110,6 +110,15 @@ struct opal_accelerator_stream_t {
110110
void *stream;
111111
};
112112
typedef struct opal_accelerator_stream_t opal_accelerator_stream_t;
113+
114+
/**
 * PCI attributes of an accelerator device, split into the domain, bus,
 * device and function components of its PCI address.
 */
struct opal_accelerator_pci_attr_t {
115+
uint16_t domain_id; /* domain component of the PCI address */
116+
uint8_t bus_id; /* bus component of the PCI address */
117+
uint8_t device_id; /* device component of the PCI address */
118+
uint8_t function_id; /* function component of the PCI address */
119+
};
120+
typedef struct opal_accelerator_pci_attr_t opal_accelerator_pci_attr_t;
121+
113122
OBJ_CLASS_DECLARATION(opal_accelerator_stream_t);
114123

115124
struct opal_accelerator_event_t {
@@ -346,6 +355,17 @@ typedef int (*opal_accelerator_base_module_host_unregister_fn_t)(
346355
typedef int (*opal_accelerator_base_module_get_device_fn_t)(
347356
int *dev_id);
348357

358+
/**
359+
* Retrieves PCI attributes of an accelerator device.
360+
*
361+
* @param[in] dev_id Accelerator device id
362+
* @param[out] pci_attr PCI attributes of the requested device
363+
*
364+
* @return OPAL_SUCCESS or error status on failure
365+
*/
366+
typedef int (*opal_accelerator_base_module_get_device_pci_attr_fn_t)(
367+
int dev_id, opal_accelerator_pci_attr_t *pci_attr);
368+
349369
/**
350370
* Queries if a device may directly access a peer device's memory.
351371
*
@@ -398,6 +418,7 @@ typedef struct {
398418
opal_accelerator_base_module_host_unregister_fn_t host_unregister;
399419

400420
opal_accelerator_base_module_get_device_fn_t get_device;
421+
opal_accelerator_base_module_get_device_pci_attr_fn_t get_device_pci_attr;
401422
opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer;
402423

403424
opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id;

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* and Technology (RIST). All rights reserved.
55
* Copyright (c) 2014 Mellanox Technologies, Inc.
66
* All rights reserved.
7-
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
7+
* Copyright (c) Amazon.com, Inc. or its affiliates.
88
* All Rights reserved.
99
* $COPYRIGHT$
1010
*
@@ -45,6 +45,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size);
4545
static int accelerator_cuda_host_unregister(int dev_id, void *ptr);
4646

4747
static int accelerator_cuda_get_device(int *dev_id);
48+
static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
4849
static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int dev2);
4950

5051
static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
@@ -70,6 +71,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
7071
accelerator_cuda_host_unregister,
7172

7273
accelerator_cuda_get_device,
74+
accelerator_cuda_get_device_pci_attr,
7375
accelerator_cuda_device_can_access_peer,
7476

7577
accelerator_cuda_get_buffer_id
@@ -578,6 +580,45 @@ static int accelerator_cuda_get_device(int *dev_id)
578580
return 0;
579581
}
580582

583+
/**
 * Retrieve the PCI attributes of a CUDA device.
 *
 * Queries the driver for the device's PCI bus id string, expected in the
 * form "domain:bus:device.function" with every field in hexadecimal, and
 * decodes it into @p pci_attr.
 *
 * @param[in]  dev_id    CUDA device id
 * @param[out] pci_attr  Receives the decoded PCI attributes; left untouched
 *                       unless OPAL_SUCCESS is returned
 *
 * @return OPAL_SUCCESS on success, OPAL_ERR_BAD_PARAM if @p pci_attr is
 *         NULL, OPAL_ERROR if the bus id cannot be obtained or parsed
 */
static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
{
    /* "dddd:bb:dd.f" plus the terminating NUL. An enum constant (unlike a
     * static const int) is a constant expression in C, so the buffer below
     * is a plain array rather than a VLA. */
    enum { PCI_BUS_ID_LENGTH = 13 };

    CUresult result;
    int ret;
    char pci_bus_id[PCI_BUS_ID_LENGTH] = {0};
    unsigned int domain_id = 0, bus_id = 0, device_id = 0, function_id = 0;

    if (NULL == pci_attr) {
        return OPAL_ERR_BAD_PARAM;
    }

    result = cuDeviceGetPCIBusId(pci_bus_id, PCI_BUS_ID_LENGTH, dev_id);
    if (CUDA_SUCCESS != result) {
        opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                            "CUDA: Failed to get device PCI bus id");
        return OPAL_ERROR;
    }

    /* Scan the fields as hexadecimal numbers directly. A field that is not
     * valid hex stops the scan and is reported as a parse failure instead of
     * silently decoding to 0. The field widths bound every value to the range
     * of its destination member, so the narrowing casts below are lossless. */
    ret = sscanf(pci_bus_id, "%4x:%2x:%2x.%1x", &domain_id, &bus_id, &device_id, &function_id);
    if (4 != ret) {
        opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                            "CUDA: Failed to parse device PCI bus id");
        return OPAL_ERROR;
    }

    /* Only write the output once the whole string has been decoded. */
    pci_attr->domain_id = (uint16_t) domain_id;
    pci_attr->bus_id = (uint8_t) bus_id;
    pci_attr->device_id = (uint8_t) device_id;
    pci_attr->function_id = (uint8_t) function_id;

    return OPAL_SUCCESS;
}
621+
581622
static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int dev2)
582623
{
583624
CUresult result;

opal/mca/accelerator/null/accelerator_null_component.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* All rights reserved.
77
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
88
* reserved.
9-
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
9+
* Copyright (c) Amazon.com, Inc. or its affiliates.
1010
* All Rights reserved.
1111
* $COPYRIGHT$
1212
*
@@ -59,6 +59,7 @@ static int accelerator_null_host_register(int dev_id, void *ptr, size_t size);
5959
static int accelerator_null_host_unregister(int dev_id, void *ptr);
6060

6161
static int accelerator_null_get_device(int *dev_id);
62+
static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
6263
static int accelerator_null_device_can_access_peer(int *access, int dev1, int dev2);
6364

6465
static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
@@ -122,6 +123,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
122123
accelerator_null_host_unregister,
123124

124125
accelerator_null_get_device,
126+
accelerator_null_get_device_pci_attr,
125127
accelerator_null_device_can_access_peer,
126128

127129
accelerator_null_get_buffer_id
@@ -235,6 +237,11 @@ static int accelerator_null_get_device(int *dev_id)
235237
return OPAL_ERR_NOT_IMPLEMENTED;
236238
}
237239

240+
/**
 * Null-component stub for the get_device_pci_attr entry point.
 *
 * Neither argument is examined and @p pci_attr is never written.
 *
 * @param[in]  dev_id    Ignored
 * @param[out] pci_attr  Ignored
 *
 * @return OPAL_ERR_NOT_IMPLEMENTED, always
 */
static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
{
    const int rc = OPAL_ERR_NOT_IMPLEMENTED;

    (void) dev_id;
    (void) pci_attr;

    return rc;
}
244+
238245
static int accelerator_null_device_can_access_peer( int *access, int dev1, int dev2)
239246
{
240247
return OPAL_ERR_NOT_IMPLEMENTED;

opal/mca/accelerator/rocm/accelerator_rocm_module.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ static int mca_accelerator_rocm_host_register(int dev_id, void *ptr, size_t size
3737
static int mca_accelerator_rocm_host_unregister(int dev_id, void *ptr);
3838

3939
static int mca_accelerator_rocm_get_device(int *dev_id);
40+
static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
4041
static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, int dev2);
4142

4243
static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
@@ -62,6 +63,7 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
6263
mca_accelerator_rocm_host_unregister,
6364

6465
mca_accelerator_rocm_get_device,
66+
mca_accelerator_rocm_get_device_pci_attr,
6567
mca_accelerator_rocm_device_can_access_peer,
6668

6769
mca_accelerator_rocm_get_buffer_id
@@ -476,6 +478,44 @@ static int mca_accelerator_rocm_get_device(int *dev_id)
476478
return OPAL_SUCCESS;
477479
}
478480

481+
/**
 * Retrieve the PCI attributes of a ROCm (HIP) device.
 *
 * Queries HIP for the device's PCI bus id string, expected in the form
 * "domain:bus:device.function" with every field in hexadecimal, and
 * decodes it into @p pci_attr.
 *
 * @param[in]  dev_id    HIP device id
 * @param[out] pci_attr  Receives the decoded PCI attributes; left untouched
 *                       unless OPAL_SUCCESS is returned
 *
 * @return OPAL_SUCCESS on success, OPAL_ERR_BAD_PARAM if @p pci_attr is
 *         NULL, OPAL_ERROR if the bus id cannot be obtained or parsed
 */
static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
{
    /* "dddd:bb:dd.f" plus the terminating NUL. An enum constant (unlike a
     * static const int) is a constant expression in C, so the buffer below
     * is a plain array rather than a VLA. */
    enum { PCI_BUS_ID_LENGTH = 13 };

    hipError_t err;
    int ret;
    char pci_bus_id[PCI_BUS_ID_LENGTH] = {0};
    unsigned int domain_id = 0, bus_id = 0, device_id = 0, function_id = 0;

    if (NULL == pci_attr) {
        return OPAL_ERR_BAD_PARAM;
    }

    err = hipDeviceGetPCIBusId(pci_bus_id, PCI_BUS_ID_LENGTH, dev_id);
    if (hipSuccess != err) {
        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                            "error retrieving device PCI attributes");
        return OPAL_ERROR;
    }

    /* Scan the fields as hexadecimal numbers directly. A field that is not
     * valid hex stops the scan and is reported as a parse failure instead of
     * silently decoding to 0. The field widths bound every value to the range
     * of its destination member, so the narrowing casts below are lossless. */
    ret = sscanf(pci_bus_id, "%4x:%2x:%2x.%1x", &domain_id, &bus_id, &device_id, &function_id);
    if (4 != ret) {
        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                            "error parsing device PCI attributes");
        return OPAL_ERROR;
    }

    /* Only write the output once the whole string has been decoded. */
    pci_attr->domain_id = (uint16_t) domain_id;
    pci_attr->bus_id = (uint8_t) bus_id;
    pci_attr->device_id = (uint8_t) device_id;
    pci_attr->function_id = (uint8_t) function_id;

    return OPAL_SUCCESS;
}
518+
479519
static int mca_accelerator_rocm_device_can_access_peer(int *access, int dev1, int dev2)
480520
{
481521
if (NULL == access || dev1 < 0 || dev2 < 0){

0 commit comments

Comments
 (0)