Skip to content

Commit 9fd8ec7

Browse files
committed
opal/mca/accelerator: introduce get_device_pci_attr api
Introduce get_device_pci_attr api to query accelerator device PCI attributes. This enables intelligent selection of other PCI(e) devices based on affinity with the accelerator, e.g. NICs. Signed-off-by: Wenduo Wang <[email protected]>
1 parent 5ea8638 commit 9fd8ec7

File tree

4 files changed

+78
-4
lines changed

4 files changed

+78
-4
lines changed

opal/mca/accelerator/accelerator.h

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
* Copyright (c) 2014-2021 Intel, Inc. All rights reserved.
33
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
6-
* All Rights reserved.
5+
* Copyright (c) Amazon.com, Inc. or its affiliates. All Rights reserved.
76
* $COPYRIGHT$
87
*
98
* Additional copyrights may follow
@@ -110,6 +109,15 @@ struct opal_accelerator_stream_t {
110109
void *stream;
111110
};
112111
typedef struct opal_accelerator_stream_t opal_accelerator_stream_t;
112+
113+
/* PCI address of an accelerator device, split into its domain, bus, device and function components. */
struct opal_accelerator_pci_attr_t {
114+
uint16_t domain_id; /* PCI domain number */
115+
uint8_t bus_id; /* PCI bus number */
116+
uint8_t device_id; /* PCI device number */
117+
uint8_t function_id; /* PCI function number */
118+
};
119+
typedef struct opal_accelerator_pci_attr_t opal_accelerator_pci_attr_t;
120+
113121
OBJ_CLASS_DECLARATION(opal_accelerator_stream_t);
114122

115123
struct opal_accelerator_event_t {
@@ -346,6 +354,17 @@ typedef int (*opal_accelerator_base_module_host_unregister_fn_t)(
346354
typedef int (*opal_accelerator_base_module_get_device_fn_t)(
347355
int *dev_id);
348356

357+
/**
358+
* Retrieves PCI attributes of an accelerator device.
359+
*
360+
* @param[in] dev_id Accelerator device id
361+
* @param[out] pci_attr PCI attributes (domain, bus, device, function) of the requested device
362+
*
363+
* @return OPAL_SUCCESS or error status on failure
364+
*/
365+
typedef int (*opal_accelerator_base_module_get_device_pci_attr_fn_t)(
366+
int dev_id, opal_accelerator_pci_attr_t *pci_attr);
367+
349368
/**
350369
* Queries if a device may directly access a peer device's memory.
351370
*
@@ -398,6 +417,7 @@ typedef struct {
398417
opal_accelerator_base_module_host_unregister_fn_t host_unregister;
399418

400419
opal_accelerator_base_module_get_device_fn_t get_device;
420+
opal_accelerator_base_module_get_device_pci_attr_fn_t get_device_pci_attr;
401421
opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer;
402422

403423
opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id;

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
* and Technology (RIST). All rights reserved.
55
* Copyright (c) 2014 Mellanox Technologies, Inc.
66
* All rights reserved.
7-
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
8-
* All Rights reserved.
7+
* Copyright (c) Amazon.com, Inc. or its affiliates. All Rights reserved.
98
* $COPYRIGHT$
109
*
1110
* Additional copyrights may follow
@@ -45,6 +44,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size);
4544
static int accelerator_cuda_host_unregister(int dev_id, void *ptr);
4645

4746
static int accelerator_cuda_get_device(int *dev_id);
47+
static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
4848
static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int dev2);
4949

5050
static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
@@ -70,6 +70,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
7070
accelerator_cuda_host_unregister,
7171

7272
accelerator_cuda_get_device,
73+
accelerator_cuda_get_device_pci_attr,
7374
accelerator_cuda_device_can_access_peer,
7475

7576
accelerator_cuda_get_buffer_id
@@ -578,6 +579,45 @@ static int accelerator_cuda_get_device(int *dev_id)
578579
return 0;
579580
}
580581

582+
/**
 * Retrieves the PCI attributes of a CUDA device.
 *
 * Asks the CUDA driver for the device's PCI bus id string, which has the
 * form "domain:bus:device.function" with every field in hexadecimal, and
 * decodes it into pci_attr.
 *
 * @param[in]  dev_id   CUDA device passed to cuDeviceGetPCIBusId()
 * @param[out] pci_attr Decoded PCI attributes; written only on success
 *
 * @return OPAL_SUCCESS on success, OPAL_ERR_BAD_PARAM if pci_attr is NULL,
 *         OPAL_ERROR if the driver query fails or the id cannot be parsed
 */
static int accelerator_cuda_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
{
    /* An enumerator is an integer constant expression, so pci_bus_id is a
     * fixed-size array and not a variable length array (which a
     * "static const int" bound would make it in C). 13 bytes hold
     * "dddd:bb:dd.f" plus the terminating NUL. */
    enum { PCI_BUS_ID_LENGTH = 13 };
    char pci_bus_id[PCI_BUS_ID_LENGTH] = {0};
    unsigned int domain_id = 0, bus_id = 0, device_id = 0, function_id = 0;
    CUresult result;
    int ret;

    if (NULL == pci_attr) {
        return OPAL_ERR_BAD_PARAM;
    }

    result = cuDeviceGetPCIBusId(pci_bus_id, PCI_BUS_ID_LENGTH, dev_id);
    if (CUDA_SUCCESS != result) {
        opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                            "CUDA: Failed to get device PCI bus id\n");
        return OPAL_ERROR;
    }

    /* Parse the hexadecimal fields directly. Every conversion carries a field
     * width, so no intermediate string buffer can be overrun, and non-hex
     * input is rejected by sscanf instead of being silently turned into 0. */
    ret = sscanf(pci_bus_id, "%4x:%2x:%2x.%1x", &domain_id, &bus_id, &device_id, &function_id);
    if (4 != ret || domain_id > UINT16_MAX || bus_id > UINT8_MAX || device_id > UINT8_MAX
        || function_id > UINT8_MAX) {
        opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                            "CUDA: Failed to parse device PCI bus id\n");
        return OPAL_ERROR;
    }

    /* Values were range-checked above, so the narrowing casts are lossless.
     * The output struct is only touched once everything has succeeded. */
    pci_attr->domain_id = (uint16_t) domain_id;
    pci_attr->bus_id = (uint8_t) bus_id;
    pci_attr->device_id = (uint8_t) device_id;
    pci_attr->function_id = (uint8_t) function_id;

    return OPAL_SUCCESS;
}
620+
581621
static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int dev2)
582622
{
583623
CUresult result;

opal/mca/accelerator/null/accelerator_null_component.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ static int accelerator_null_host_register(int dev_id, void *ptr, size_t size);
5959
static int accelerator_null_host_unregister(int dev_id, void *ptr);
6060

6161
static int accelerator_null_get_device(int *dev_id);
62+
static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
6263
static int accelerator_null_device_can_access_peer(int *access, int dev1, int dev2);
6364

6465
static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
@@ -122,6 +123,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
122123
accelerator_null_host_unregister,
123124

124125
accelerator_null_get_device,
126+
accelerator_null_get_device_pci_attr,
125127
accelerator_null_device_can_access_peer,
126128

127129
accelerator_null_get_buffer_id
@@ -235,6 +237,11 @@ static int accelerator_null_get_device(int *dev_id)
235237
return OPAL_ERR_NOT_IMPLEMENTED;
236238
}
237239

240+
/* Null component stub: performs no lookup and always returns OPAL_ERR_NOT_IMPLEMENTED; dev_id and pci_attr are not used. */
static int accelerator_null_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
241+
{
242+
return OPAL_ERR_NOT_IMPLEMENTED;
243+
}
244+
238245
static int accelerator_null_device_can_access_peer( int *access, int dev1, int dev2)
239246
{
240247
return OPAL_ERR_NOT_IMPLEMENTED;

opal/mca/accelerator/rocm/accelerator_rocm_module.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ static int mca_accelerator_rocm_host_register(int dev_id, void *ptr, size_t size
3737
static int mca_accelerator_rocm_host_unregister(int dev_id, void *ptr);
3838

3939
static int mca_accelerator_rocm_get_device(int *dev_id);
40+
static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr);
4041
static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, int dev2);
4142

4243
static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
@@ -62,6 +63,7 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
6263
mca_accelerator_rocm_host_unregister,
6364

6465
mca_accelerator_rocm_get_device,
66+
mca_accelerator_rocm_get_device_pci_attr,
6567
mca_accelerator_rocm_device_can_access_peer,
6668

6769
mca_accelerator_rocm_get_buffer_id
@@ -476,6 +478,11 @@ static int mca_accelerator_rocm_get_device(int *dev_id)
476478
return OPAL_SUCCESS;
477479
}
478480

481+
/* ROCm stub: PCI attribute lookup is not implemented here; always returns OPAL_ERR_NOT_IMPLEMENTED and leaves pci_attr untouched. */
static int mca_accelerator_rocm_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
482+
{
483+
return OPAL_ERR_NOT_IMPLEMENTED;
484+
}
485+
479486
static int mca_accelerator_rocm_device_can_access_peer(int *access, int dev1, int dev2)
480487
{
481488
if (NULL == access || dev1 < 0 || dev2 < 0){

0 commit comments

Comments
 (0)