Skip to content

opal/mca/accelerator: introduce get_device_pci_attr api #11687

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion opal/mca/accelerator/accelerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* Copyright (c) 2014-2021 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
* Copyright (c) Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -110,6 +110,15 @@ struct opal_accelerator_stream_t {
void *stream;
};
typedef struct opal_accelerator_stream_t opal_accelerator_stream_t;

/**
 * PCI attributes of an accelerator device, filled in by a component's
 * get_device_pci_attr function.
 *
 * The four fields correspond to the parts of a PCI bus id string of the
 * form "domain:bus:device.function".
 */
struct opal_accelerator_pci_attr_t {
/* PCI domain part of the bus id */
uint16_t domain_id;
/* PCI bus part of the bus id */
uint8_t bus_id;
/* PCI device part of the bus id */
uint8_t device_id;
/* PCI function part of the bus id */
uint8_t function_id;
};
typedef struct opal_accelerator_pci_attr_t opal_accelerator_pci_attr_t;

OBJ_CLASS_DECLARATION(opal_accelerator_stream_t);

struct opal_accelerator_event_t {
Expand Down Expand Up @@ -346,6 +355,17 @@ typedef int (*opal_accelerator_base_module_host_unregister_fn_t)(
typedef int (*opal_accelerator_base_module_get_device_fn_t)(
int *dev_id);

/**
 * Retrieves PCI attributes of an accelerator device.
 *
 * @param[in]  dev_id    Accelerator device id
 * @param[out] pci_attr  PCI attributes of the requested device
 *
 * @return OPAL_SUCCESS on success or an error status on failure.
 *         The CUDA and ROCm implementations return OPAL_ERR_BAD_PARAM
 *         when pci_attr is NULL and OPAL_ERROR when the bus id cannot
 *         be queried or parsed; the null component always returns
 *         OPAL_ERR_NOT_IMPLEMENTED.
 */
typedef int (*opal_accelerator_base_module_get_device_pci_attr_fn_t)(
int dev_id, opal_accelerator_pci_attr_t *pci_attr);

/**
* Queries if a device may directly access a peer device's memory.
*
Expand Down Expand Up @@ -398,6 +418,7 @@ typedef struct {
opal_accelerator_base_module_host_unregister_fn_t host_unregister;

opal_accelerator_base_module_get_device_fn_t get_device;
opal_accelerator_base_module_get_device_pci_attr_fn_t get_device_pci_attr;
opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer;

opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id;
Expand Down
43 changes: 42 additions & 1 deletion opal/mca/accelerator/cuda/accelerator_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
* Copyright (c) Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -45,6 +45,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size);
static int accelerator_cuda_host_unregister(int dev_id, void *ptr);

static int accelerator_cuda_get_device(int *dev_id);
static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int dev2);

static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
Expand All @@ -70,6 +71,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
accelerator_cuda_host_unregister,

accelerator_cuda_get_device,
accelerator_cuda_get_device_pci_attr,
accelerator_cuda_device_can_access_peer,

accelerator_cuda_get_buffer_id
Expand Down Expand Up @@ -578,6 +580,45 @@ static int accelerator_cuda_get_device(int *dev_id)
return 0;
}

/**
 * Retrieves the PCI attributes of a CUDA device.
 *
 * Asks the driver for the device's PCI bus id string and parses its
 * hexadecimal "domain:bus:device.function" fields into pci_attr.
 *
 * @param[in]  dev_id    Device id handed to cuDeviceGetPCIBusId
 * @param[out] pci_attr  Receives the parsed attributes on success; it is
 *                       left untouched on failure
 *
 * @return OPAL_SUCCESS on success, OPAL_ERR_BAD_PARAM if pci_attr is NULL,
 *         OPAL_ERROR if the bus id cannot be obtained or parsed
 */
static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
{
    /* "dddd:bb:dd.f" is 12 characters plus the terminating NUL. An enum
     * constant (unlike a static const int) is an integer constant
     * expression, so pci_bus_id is a real fixed-size array, not a VLA. */
    enum { PCI_BUS_ID_LENGTH = 13 };
    char pci_bus_id[PCI_BUS_ID_LENGTH] = {0};
    unsigned int domain = 0, bus = 0, device = 0, function = 0;
    CUresult result;
    int ret;

    if (NULL == pci_attr) {
        return OPAL_ERR_BAD_PARAM;
    }

    result = cuDeviceGetPCIBusId(pci_bus_id, PCI_BUS_ID_LENGTH, dev_id);
    if (CUDA_SUCCESS != result) {
        opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                            "CUDA: Failed to get device PCI bus id");
        return OPAL_ERROR;
    }

    /* Convert the hexadecimal fields directly. Unlike %s followed by
     * strtol(), a %x conversion fails on non-hex characters instead of
     * silently yielding 0, and it stops at the ':' / '.' separators so
     * fields shorter than the maximum width are still accepted. The
     * field widths bound the number of digits consumed. */
    ret = sscanf(pci_bus_id, "%4x:%2x:%2x.%1x", &domain, &bus, &device, &function);
    if (4 != ret) {
        opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                            "CUDA: Failed to parse device PCI bus id");
        return OPAL_ERROR;
    }

    /* %x tolerates a leading sign, which would wrap to a huge value;
     * reject anything that does not fit the destination fields. */
    if (domain > 0xFFFFu || bus > 0xFFu || device > 0xFFu || function > 0xFFu) {
        opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                            "CUDA: Failed to parse device PCI bus id");
        return OPAL_ERROR;
    }

    pci_attr->domain_id = (uint16_t) domain;
    pci_attr->bus_id = (uint8_t) bus;
    pci_attr->device_id = (uint8_t) device;
    pci_attr->function_id = (uint8_t) function;

    return OPAL_SUCCESS;
}

static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int dev2)
{
CUresult result;
Expand Down
9 changes: 8 additions & 1 deletion opal/mca/accelerator/null/accelerator_null_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
* Copyright (c) Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -59,6 +59,7 @@ static int accelerator_null_host_register(int dev_id, void *ptr, size_t size);
static int accelerator_null_host_unregister(int dev_id, void *ptr);

static int accelerator_null_get_device(int *dev_id);
static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
static int accelerator_null_device_can_access_peer(int *access, int dev1, int dev2);

static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
Expand Down Expand Up @@ -122,6 +123,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
accelerator_null_host_unregister,

accelerator_null_get_device,
accelerator_null_get_device_pci_attr,
accelerator_null_device_can_access_peer,

accelerator_null_get_buffer_id
Expand Down Expand Up @@ -235,6 +237,11 @@ static int accelerator_null_get_device(int *dev_id)
return OPAL_ERR_NOT_IMPLEMENTED;
}

/**
 * PCI attribute query of the null accelerator component.
 *
 * Neither argument is inspected; the call unconditionally reports that
 * the operation is not implemented.
 *
 * @param[in]  dev_id    Ignored
 * @param[out] pci_attr  Ignored, never written
 *
 * @return OPAL_ERR_NOT_IMPLEMENTED, always
 */
static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
{
    /* Explicitly mark the parameters as intentionally unused. */
    (void) dev_id;
    (void) pci_attr;

    return OPAL_ERR_NOT_IMPLEMENTED;
}

static int accelerator_null_device_can_access_peer( int *access, int dev1, int dev2)
{
return OPAL_ERR_NOT_IMPLEMENTED;
Expand Down
40 changes: 40 additions & 0 deletions opal/mca/accelerator/rocm/accelerator_rocm_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ static int mca_accelerator_rocm_host_register(int dev_id, void *ptr, size_t size
static int mca_accelerator_rocm_host_unregister(int dev_id, void *ptr);

static int mca_accelerator_rocm_get_device(int *dev_id);
static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, int dev2);

static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
Expand All @@ -62,6 +63,7 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
mca_accelerator_rocm_host_unregister,

mca_accelerator_rocm_get_device,
mca_accelerator_rocm_get_device_pci_attr,
mca_accelerator_rocm_device_can_access_peer,

mca_accelerator_rocm_get_buffer_id
Expand Down Expand Up @@ -476,6 +478,44 @@ static int mca_accelerator_rocm_get_device(int *dev_id)
return OPAL_SUCCESS;
}

/**
 * Retrieves the PCI attributes of a ROCm device.
 *
 * Asks the HIP runtime for the device's PCI bus id string and parses its
 * hexadecimal "domain:bus:device.function" fields into pci_attr.
 *
 * @param[in]  dev_id    Device id handed to hipDeviceGetPCIBusId
 * @param[out] pci_attr  Receives the parsed attributes on success; it is
 *                       left untouched on failure
 *
 * @return OPAL_SUCCESS on success, OPAL_ERR_BAD_PARAM if pci_attr is NULL,
 *         OPAL_ERROR if the bus id cannot be obtained or parsed
 */
static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
{
    /* "dddd:bb:dd.f" is 12 characters plus the terminating NUL. An enum
     * constant (unlike a static const int) is an integer constant
     * expression, so pci_bus_id is a real fixed-size array, not a VLA. */
    enum { PCI_BUS_ID_LENGTH = 13 };
    char pci_bus_id[PCI_BUS_ID_LENGTH] = {0};
    unsigned int domain = 0, bus = 0, device = 0, function = 0;
    hipError_t err;
    int ret;

    if (NULL == pci_attr) {
        return OPAL_ERR_BAD_PARAM;
    }

    err = hipDeviceGetPCIBusId(pci_bus_id, PCI_BUS_ID_LENGTH, dev_id);
    if (hipSuccess != err) {
        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                            "error retrieving device PCI attributes");
        return OPAL_ERROR;
    }

    /* Convert the hexadecimal fields directly. Unlike %s followed by
     * strtol(), a %x conversion fails on non-hex characters instead of
     * silently yielding 0, and it stops at the ':' / '.' separators so
     * fields shorter than the maximum width are still accepted. The
     * field widths bound the number of digits consumed. */
    ret = sscanf(pci_bus_id, "%4x:%2x:%2x.%1x", &domain, &bus, &device, &function);
    if (4 != ret) {
        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                            "error parsing device PCI attributes");
        return OPAL_ERROR;
    }

    /* %x tolerates a leading sign, which would wrap to a huge value;
     * reject anything that does not fit the destination fields. */
    if (domain > 0xFFFFu || bus > 0xFFu || device > 0xFFu || function > 0xFFu) {
        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                            "error parsing device PCI attributes");
        return OPAL_ERROR;
    }

    pci_attr->domain_id = (uint16_t) domain;
    pci_attr->bus_id = (uint8_t) bus;
    pci_attr->device_id = (uint8_t) device;
    pci_attr->function_id = (uint8_t) function;

    return OPAL_SUCCESS;
}

static int mca_accelerator_rocm_device_can_access_peer(int *access, int dev1, int dev2)
{
if (NULL == access || dev1 < 0 || dev2 < 0){
Expand Down