@@ -107,6 +107,9 @@ static int delay_inout_place = 0;
107107static int enable_out_of_place = 1 ;
108108static int enable_cache_flush = 0 ;
109109static int enable_rotating_tensor = 0 ;
110+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
111+ static int local_register = 0 ;
112+ #endif
110113
111114#define NUM_BLOCKS 32
112115
@@ -840,10 +843,22 @@ testResult_t threadInit(struct threadArgs* args) {
840843 NCCLCHECK (ncclCommInitRank (args->comms +i, nranks, args->ncclId , rank));
841844 }
842845 NCCLCHECK (ncclGroupEnd ());
846+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
847+ void **sendRegHandles = (local_register) ? (void **)malloc (sizeof (*sendRegHandles)*args->nGpus ) : NULL ;
848+ void **recvRegHandles = (local_register) ? (void **)malloc (sizeof (*recvRegHandles)*args->nGpus ) : NULL ;
849+ for (int i=0 ; i<args->nGpus ; i++) {
850+ if (local_register) NCCLCHECK (ncclCommRegister (args->comms [i], args->sendbuffs [i], args->maxbytes , &sendRegHandles[i]));
851+ if (local_register) NCCLCHECK (ncclCommRegister (args->comms [i], args->recvbuffs [i], args->maxbytes , &recvRegHandles[i]));
852+ }
853+ #endif
843854
844855 TESTCHECK (threadRunTests (args));
845856
846857 for (int i=0 ; i<args->nGpus ; i++) {
858+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
859+ if (local_register) NCCLCHECK (ncclCommDeregister (args->comms [i], sendRegHandles[i]));
860+ if (local_register) NCCLCHECK (ncclCommDeregister (args->comms [i], recvRegHandles[i]));
861+ #endif
847862 NCCLCHECK (ncclCommDestroy (args->comms [i]));
848863 }
849864 return testSuccess;
@@ -951,15 +966,16 @@ int main(int argc, char* argv[]) {
951966 {" average" , required_argument, 0 , ' a' },
952967 {" out_of_place" , required_argument, 0 , ' O' },
953968 {" cache_flush" , required_argument, 0 , ' F' },
954- {" rotating_tensor" , required_argument, 0 , ' R' },
969+ {" rotating_tensor" , required_argument, 0 , ' E' },
970+ {" local_register" , required_argument, 0 , ' R' },
955971 {" help" , no_argument, 0 , ' h' },
956972 {}
957973 };
958974
959975 while (1 ) {
960976 int c;
961977
962- c = getopt_long (argc, argv, " t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:R:a:y:s:u:h:q:" , longopts, &longindex);
978+ c = getopt_long (argc, argv, " t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:E: R:a:y:s:u:h:q:" , longopts, &longindex);
963979
964980 if (c == -1 )
965981 break ;
@@ -1067,14 +1083,22 @@ int main(int argc, char* argv[]) {
10671083 gpu_block3 = deviceProps.multiProcessorCount * 60 ;
10681084 }
10691085 break ;
1070- case ' R ' :
1086+ case ' E ' :
10711087 enable_rotating_tensor = strtol (optarg, NULL , 0 );
10721088 break ;
10731089 case ' a' :
10741090 average = (int )strtol (optarg, NULL , 0 );
10751091 break ;
10761092 case ' q' :
10771093 delay_inout_place = (int )strtol (optarg, NULL , 10 );
1094+ case ' R' :
1095+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1096+ if ((int )strtol (optarg, NULL , 0 )) {
1097+ local_register = 1 ;
1098+ }
1099+ #else
1100+ printf (" Option -R (register) is not supported before NCCL 2.19. Ignoring\n " );
1101+ #endif
10781102 break ;
10791103 case ' h' :
10801104 default :
@@ -1109,10 +1133,11 @@ int main(int argc, char* argv[]) {
11091133 " [-G,--cudagraph <num graph launches>] \n\t "
11101134 " [-C,--report_cputime <0/1>] \n\t "
11111135 " [-O,--out_of_place <0/1>] \n\t "
1112- " [-F,--cache_flush <number of iterations between instruction cache flush>] \n\t "
1113- " [-R ,--rotating_tensor <0/1>] \n\t "
1136+ " [-F,--cache_flush <number of iterations between instruction cache flush>] \n\t "
1137+ " [-E ,--rotating_tensor <0/1>] \n\t "
11141138 " [-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t "
11151139 " [-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t "
1140+ " [-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t "
11161141 " [-h,--help]\n " ,
11171142 basename (argv[0 ]));
11181143 return 0 ;
@@ -1256,6 +1281,10 @@ testResult_t run() {
12561281
12571282 // if parallel init is not selected, use main thread to initialize NCCL
12581283 ncclComm_t* comms = (ncclComm_t*)malloc (sizeof (ncclComm_t)*nThreads*nGpus);
1284+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1285+ void **sendRegHandles = NULL ;
1286+ void **recvRegHandles = NULL ;
1287+ #endif
12591288 if (!parallel_init) {
12601289 if (ncclProcs == 1 ) {
12611290 NCCLCHECK (ncclCommInitAll (comms, nGpus*nThreads, gpus));
@@ -1267,6 +1296,14 @@ testResult_t run() {
12671296 }
12681297 NCCLCHECK (ncclGroupEnd ());
12691298 }
1299+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1300+ sendRegHandles = (local_register) ? (void **)malloc (sizeof (*sendRegHandles)*nThreads*nGpus) : NULL ;
1301+ recvRegHandles = (local_register) ? (void **)malloc (sizeof (*recvRegHandles)*nThreads*nGpus) : NULL ;
1302+ for (int i=0 ; i<nGpus*nThreads; i++) {
1303+ if (local_register) NCCLCHECK (ncclCommRegister (comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
1304+ if (local_register) NCCLCHECK (ncclCommRegister (comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
1305+ }
1306+ #endif
12701307 }
12711308
12721309 int errors[nThreads];
@@ -1352,18 +1389,33 @@ testResult_t run() {
13521389#endif
13531390
13541391 if (!parallel_init) {
1355- for (int i=0 ; i<nGpus*nThreads; ++i)
1392+ for (int i=0 ; i<nGpus*nThreads; ++i) {
1393+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1394+ if (local_register) NCCLCHECK (ncclCommDeregister (comms[i], sendRegHandles[i]));
1395+ if (local_register) NCCLCHECK (ncclCommDeregister (comms[i], recvRegHandles[i]));
1396+ #endif
13561397 NCCLCHECK (ncclCommDestroy (comms[i]));
1398+ }
13571399 free (comms);
13581400 }
13591401
13601402 // Free off CUDA allocated memory
13611403 for (int i=0 ; i<nGpus*nThreads; i++) {
1404+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1405+ if (sendbuffs[i]) NCCLCHECK (ncclMemFree ((char *)sendbuffs[i]));
1406+ if (recvbuffs[i]) NCCLCHECK (ncclMemFree ((char *)recvbuffs[i]));
1407+ if (datacheck) NCCLCHECK (ncclMemFree (expected[i]));
1408+ #else
13621409 if (sendbuffs[i]) CUDACHECK (cudaFree ((char *)sendbuffs[i]));
13631410 if (recvbuffs[i]) CUDACHECK (cudaFree ((char *)recvbuffs[i]));
13641411 if (datacheck) CUDACHECK (cudaFree (expected[i]));
1412+ #endif
13651413 }
13661414 CUDACHECK (cudaFreeHost (delta));
1415+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1416+ free (sendRegHandles);
1417+ free (recvRegHandles);
1418+ #endif
13671419
13681420 envstr = getenv (" NCCL_TESTS_MIN_BW" );
13691421 double check_avg_bw = envstr ? atof (envstr) : -1 ;
0 commit comments