NVIDIA
diff --git a/‎ext-net/README.md
+22-8 b/‎ext-net/README.md
+22-8
diff --git a/‎ext-net/example/nccl/net.h
+11-2 b/‎ext-net/example/nccl/net.h
+11-2
diff --git a/‎ext-net/example/nccl/net_device.h
+2-1 b/‎ext-net/example/nccl/net_device.h
+2-1
diff --git a/‎ext-net/example/nccl/net_v10.h
+101 b/‎ext-net/example/nccl/net_v10.h
+101
diff --git a/‎ext-net/example/nccl/net_v2.h
+2-2 b/‎ext-net/example/nccl/net_v2.h
+2-2
diff --git a/‎ext-net/example/nccl/net_v3.h
+2-2 b/‎ext-net/example/nccl/net_v3.h
+2-2
diff --git a/‎ext-net/example/nccl/net_v4.h
+2-2 b/‎ext-net/example/nccl/net_v4.h
+2-2
diff --git a/‎ext-net/example/nccl/net_v5.h
+2-2 b/‎ext-net/example/nccl/net_v5.h
+2-2
diff --git a/‎ext-net/example/nccl/net_v6.h
+2-4 b/‎ext-net/example/nccl/net_v6.h
+2-4
diff --git a/‎ext-net/example/nccl/net_v7.h
+2-4 b/‎ext-net/example/nccl/net_v7.h
+2-4
diff --git a/‎ext-net/example/nccl/net_v8.h
+2-4 b/‎ext-net/example/nccl/net_v8.h
+2-4
diff --git a/‎ext-net/example/nccl/net_v9.h
+3-9 b/‎ext-net/example/nccl/net_v9.h
+3-9
@@ -60,20 +60,20 @@ of newer ones.
 The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.
 
-# API (v9)
+# API (v10)
 
-Below is the main `ncclNet_v9` struct. Each function is explained in later sections.
+Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
 
 ```
 typedef struct {
   // Name of the network (mainly for logs)
   const char* name;
   // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
   // Return the number of adapters.
   ncclResult_t (*devices)(int* ndev);
   // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
   // Create a receiving object and provide a handle to connect to it. The
   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
   // between ranks to create a connection.
@@ -83,13 +83,13 @@ typedef struct {
   // should return successfully with sendComm == NULL with the expectation that
   // it will be called again until sendComm != NULL.
   // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
   // Finalize connection establishment after remote peer has called connect.
   // This call must not block for the connection to be established, and instead
   // should return successfully with recvComm == NULL with the expectation that
   // it will be called again until recvComm != NULL.
   // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
   ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
@@ -98,10 +98,10 @@ typedef struct {
   ncclResult_t (*deregMr)(void* comm, void* mhandle);
   // Asynchronous send to a peer.
   // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request);
   // Asynchronous recv from a peer.
   // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request);
   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
   // visible to the GPU
   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
@@ -200,6 +200,9 @@ the plugin code adding the following definitions:
 #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
 ```
 
+The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and
+record its own events with the NCCL profiler plugin.
+
 `devices`
 
 Once the plugin is initialized, NCCL will query the number of devices available. It should not
@@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
 should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
 succeeds.
 
+The `connect` API takes a pointer to a `ncclNetCommConfig_t`, which contains a `trafficClass` field.
+This field can be used by the network plugin to specify the QoS level of the connection. By default,
+`trafficClass` is set to -1 (`NCCL_NET_TRAFFIC_CLASS_UNDEF`) but can be configured by the application
+during communicator initialization to select a plugin-supported QoS level.
+
 `closeListen`/`closeSend`/`closeRecv`
 
 Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
@@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal
 the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
 `isend` again later.
 
+The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin
+to support network-defined events.
+
 `irecv`
 
 To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
@@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to
 completions on such irecvs (for example, complete the request immediately). The plugin is still
 expected to set a valid request pointer on return which NCCL can poll to check for completion.
 
+The `pHandles` argument allows NCCL to pass an array of opaque handles that can be used by the
+network plugin to support network-defined events.
+
 Note: for a given connection, send/receive operations should always match in the order they were
 posted. Tags provided for receive operations are only used to assign a given send operation to one
 of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
 
@@ -2,14 +2,15 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_H_
-#define NCCL_NET_H_
+#ifndef NET_H_
+#define NET_H_
 
 #include <stdint.h>
 #include <stdlib.h>
 
 #include "common.h"
 #include "err.h"
+#include "net_device.h"
 
 #define NCCL_NET_HANDLE_MAXSIZE 128
 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
@@ -22,6 +23,9 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32
 
+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
+
+#include "net_v10.h"
 #include "net_v9.h"
 #include "net_v8.h"
 #include "net_v7.h"
@@ -31,4 +35,9 @@
 #include "net_v3.h"
 #include "net_v2.h"
 
+typedef ncclNet_v10_t ncclNet_t;
+typedef ncclNetProperties_v10_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+
 #endif // end include guard
@@ -26,6 +26,7 @@ typedef struct {
 
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
-typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
 
 #endif
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V10_H_
+#define NET_V10_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4  // Capacity of the devs[] array in ncclNetVDeviceProps_v10_t
+typedef struct {
+  int ndevs;                               // presumably the number of valid entries in devs[] -- TODO confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; // presumably the device indices backing the virtual NIC -- TODO confirm
+} ncclNetVDeviceProps_v10_t;               // Properties passed to makeVDevice() in ncclNet_v10_t
+
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1  // Sentinel: no traffic class has been selected
+typedef struct {
+  // Plugin-specific traffic class (TC) value; presumably NCCL_NET_TRAFFIC_CLASS_UNDEF when unset -- TODO confirm
+  int trafficClass;
+} ncclNetCommConfig_v10_t;  // Per-connection configuration passed to connect() in ncclNet_v10_t
+
+
+typedef struct {  // Per-device properties filled in through getProperties() (v10)
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v10_t vProps; // Virtual device properties (see ncclNetVDeviceProps_v10_t)
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v10_t;
+
+typedef struct {  // v10 network plugin interface: the table of entry points a plugin provides
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network. profFunction is a NCCL core callback the plugin can use to define and record its own profiler events.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer. config carries per-connection settings (trafficClass).
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer. phandle is an opaque handle the plugin can use to support network-defined events.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer. phandles is an array of opaque handles the plugin can use to support network-defined events.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
+} ncclNet_v10_t;
+
+#endif // end include guard
@@ -2,8 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V2_H_
-#define NCCL_NET_V2_H_
+#ifndef NET_V2_H_
+#define NET_V2_H_
 
 typedef struct {
   // Name of the network (mainly for logs)
 
@@ -2,8 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V3_H_
-#define NCCL_NET_V3_H_
+#ifndef NET_V3_H_
+#define NET_V3_H_
 
 #define NCCL_NET_MAX_REQUESTS_V3 16
 
 
@@ -2,8 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V4_H_
-#define NCCL_NET_V4_H_
+#ifndef NET_V4_H_
+#define NET_V4_H_
 
 #define NCCL_NET_HANDLE_MAXSIZE_V4 64
 
 
@@ -2,8 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V5_H_
-#define NCCL_NET_V5_H_
+#ifndef NET_V5_H_
+#define NET_V5_H_
 
 typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
 typedef struct {
 
@@ -2,10 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V6_H_
-#define NCCL_NET_V6_H_
-
-#define NCCL_NET_MAX_REQUESTS_V6 8
+#ifndef NET_V6_H_
+#define NET_V6_H_
 
 typedef struct {
   char* name;     // Used mostly for logging.
 
@@ -2,10 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V7_H_
-#define NCCL_NET_V7_H_
-
-#include "net_device.h"
+#ifndef NET_V7_H_
+#define NET_V7_H_
 
 typedef struct {
   char* name;                      // Used mostly for logging.
 
@@ -2,10 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V8_H_
-#define NCCL_NET_V8_H_
-
-#include "net_device.h"
+#ifndef NET_V8_H_
+#define NET_V8_H_
 
 typedef struct {
   char* name;                      // Used mostly for logging.
 
@@ -2,18 +2,14 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_NET_V9_H_
-#define NCCL_NET_V9_H_
-
-#include "net_device.h"
+#ifndef NET_V9_H_
+#define NET_V9_H_
 
 #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
-#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
 typedef struct {
   int ndevs;
   int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
 } ncclNetVDeviceProps_v9_t;
-typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;
 
 typedef struct {
   char* name;                      // Used mostly for logging.
@@ -35,8 +31,6 @@ typedef struct {
   size_t maxCollBytes;             // Max transfer size for collective operations
 } ncclNetProperties_v9_t;
 
-typedef ncclNetProperties_v9_t ncclNetProperties_t;
-
 typedef struct {
   // Name of the network (mainly for logs)
   const char* name;
@@ -93,7 +87,7 @@ typedef struct {
 
   // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
   // what index this new vNIC exists at
-  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
 } ncclNet_v9_t;
 
 #endif // end include guard