diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index e283ef25808..efc951377ca 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -2,7 +2,7 @@ * Copyright (c) 2014-2021 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. + * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * @@ -110,6 +110,15 @@ struct opal_accelerator_stream_t { void *stream; }; typedef struct opal_accelerator_stream_t opal_accelerator_stream_t; + +struct opal_accelerator_pci_attr_t { + uint16_t domain_id; + uint8_t bus_id; + uint8_t device_id; + uint8_t function_id; +}; +typedef struct opal_accelerator_pci_attr_t opal_accelerator_pci_attr_t; + OBJ_CLASS_DECLARATION(opal_accelerator_stream_t); struct opal_accelerator_event_t { @@ -346,6 +355,17 @@ typedef int (*opal_accelerator_base_module_host_unregister_fn_t)( typedef int (*opal_accelerator_base_module_get_device_fn_t)( int *dev_id); +/** + * Retrieves PCI attributes of an accelerator device. + * + * @param[in] dev_id Accelerator device id + * @param[out] pci_attr PCI attributes of the requested device + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_get_device_pci_attr_fn_t)( + int dev_id, opal_accelerator_pci_attr_t *pci_attr); + /** * Queries if a device may directly access a peer device's memory. 
* @@ -398,6 +418,7 @@ typedef struct { opal_accelerator_base_module_host_unregister_fn_t host_unregister; opal_accelerator_base_module_get_device_fn_t get_device; + opal_accelerator_base_module_get_device_pci_attr_fn_t get_device_pci_attr; opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer; opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id; diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 95bcb82b535..49d181a0b00 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -4,7 +4,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. + * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * @@ -45,6 +45,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size); static int accelerator_cuda_host_unregister(int dev_id, void *ptr); static int accelerator_cuda_get_device(int *dev_id); +static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr); static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int dev2); static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); @@ -70,6 +71,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_host_unregister, accelerator_cuda_get_device, + accelerator_cuda_get_device_pci_attr, accelerator_cuda_device_can_access_peer, accelerator_cuda_get_buffer_id @@ -578,6 +580,45 @@ static int accelerator_cuda_get_device(int *dev_id) return 0; } +static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr) +{ + CUresult result; + int ret; + static const int PCI_BUS_ID_LENGTH = 13; + char pci_bus_id[PCI_BUS_ID_LENGTH]; + char 
domain_id[5] = {0}, bus_id[3] = {0}, device_id[3] = {0}, function_id[2] = {0}; + + if (NULL == pci_attr) { + return OPAL_ERR_BAD_PARAM; + } + + result = cuDeviceGetPCIBusId(pci_bus_id, PCI_BUS_ID_LENGTH, dev_id); + + if (CUDA_SUCCESS != result) { + opal_output_verbose(5, opal_accelerator_base_framework.framework_output, + "CUDA: Failed to get device PCI bus id"); + return OPAL_ERROR; + } + + ret = sscanf(pci_bus_id, "%4s:%2s:%2s.%1s", domain_id, bus_id, device_id, function_id); + if (4 > ret) { + opal_output_verbose(5, opal_accelerator_base_framework.framework_output, + "CUDA: Failed to parse device PCI bus id"); + return OPAL_ERROR; + } + + errno = 0; + pci_attr->domain_id = strtol(domain_id, NULL, 16); + pci_attr->bus_id = strtol(bus_id, NULL, 16); + pci_attr->device_id = strtol(device_id, NULL, 16); + pci_attr->function_id = strtol(function_id, NULL, 16); + if (0 != errno) { + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} + static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int dev2) { CUresult result; diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 4c4d47495be..4a0d307497b 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -6,7 +6,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. + * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. 
* $COPYRIGHT$ * @@ -59,6 +59,7 @@ static int accelerator_null_host_register(int dev_id, void *ptr, size_t size); static int accelerator_null_host_unregister(int dev_id, void *ptr); static int accelerator_null_get_device(int *dev_id); +static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr); static int accelerator_null_device_can_access_peer(int *access, int dev1, int dev2); static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); @@ -122,6 +123,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_host_unregister, accelerator_null_get_device, + accelerator_null_get_device_pci_attr, accelerator_null_device_can_access_peer, accelerator_null_get_buffer_id @@ -235,6 +237,11 @@ static int accelerator_null_get_device(int *dev_id) return OPAL_ERR_NOT_IMPLEMENTED; } +static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr) +{ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int accelerator_null_device_can_access_peer( int *access, int dev1, int dev2) { return OPAL_ERR_NOT_IMPLEMENTED; diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index c0cf8afad36..e3425480647 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -37,6 +37,7 @@ static int mca_accelerator_rocm_host_register(int dev_id, void *ptr, size_t size static int mca_accelerator_rocm_host_unregister(int dev_id, void *ptr); static int mca_accelerator_rocm_get_device(int *dev_id); +static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr); static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, int dev2); static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); @@ -62,6 +63,7 @@ 
opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_host_unregister, mca_accelerator_rocm_get_device, + mca_accelerator_rocm_get_device_pci_attr, mca_accelerator_rocm_device_can_access_peer, mca_accelerator_rocm_get_buffer_id @@ -476,6 +478,44 @@ static int mca_accelerator_rocm_get_device(int *dev_id) return OPAL_SUCCESS; } +static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr) +{ + hipError_t err; + int ret; + static const int PCI_BUS_ID_LENGTH = 13; + char pci_bus_id[PCI_BUS_ID_LENGTH]; + char domain_id[5] = {0}, bus_id[3] = {0}, device_id[3] = {0}, function_id[2] = {0}; + + if (NULL == pci_attr) { + return OPAL_ERR_BAD_PARAM; + } + + err = hipDeviceGetPCIBusId(pci_bus_id, PCI_BUS_ID_LENGTH, dev_id); + if(hipSuccess != err) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error retrieving device PCI attributes"); + return OPAL_ERROR; + } + + ret = sscanf(pci_bus_id, "%4s:%2s:%2s.%1s", domain_id, bus_id, device_id, function_id); + if (4 > ret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error parsing device PCI attributes"); + return OPAL_ERROR; + } + + errno = 0; + pci_attr->domain_id = strtol(domain_id, NULL, 16); + pci_attr->bus_id = strtol(bus_id, NULL, 16); + pci_attr->device_id = strtol(device_id, NULL, 16); + pci_attr->function_id = strtol(function_id, NULL, 16); + if (0 != errno) { + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} + static int mca_accelerator_rocm_device_can_access_peer(int *access, int dev1, int dev2) { if (NULL == access || dev1 < 0 || dev2 < 0){