Skip to content

Commit 52aee69

Browse files
authored
Merge pull request ROCm#86 from AtlantaPepsi/UBR_merge
Registered Buffer option from nccl-tests merged
2 parents e635e9c + 71355df commit 52aee69

File tree

2 files changed

+59
-7
lines changed

2 files changed

+59
-7
lines changed

‎README.md‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ $ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rc
1515
RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
1616

1717
```shell
18-
$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl
18+
$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
1919
```
2020

2121
RCCL tests can also be built using cmake. A typical sequence will be:

‎src/common.cu‎

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ static int delay_inout_place = 0;
107107
static int enable_out_of_place = 1;
108108
static int enable_cache_flush = 0;
109109
static int enable_rotating_tensor = 0;
110+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
111+
static int local_register = 0;
112+
#endif
110113

111114
#define NUM_BLOCKS 32
112115

@@ -840,10 +843,22 @@ testResult_t threadInit(struct threadArgs* args) {
840843
NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
841844
}
842845
NCCLCHECK(ncclGroupEnd());
846+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
847+
void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
848+
void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
849+
for (int i=0; i<args->nGpus; i++) {
850+
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
851+
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
852+
}
853+
#endif
843854

844855
TESTCHECK(threadRunTests(args));
845856

846857
for (int i=0; i<args->nGpus; i++) {
858+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
859+
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
860+
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
861+
#endif
847862
NCCLCHECK(ncclCommDestroy(args->comms[i]));
848863
}
849864
return testSuccess;
@@ -951,15 +966,16 @@ int main(int argc, char* argv[]) {
951966
{"average", required_argument, 0, 'a'},
952967
{"out_of_place", required_argument, 0, 'O'},
953968
{"cache_flush", required_argument, 0, 'F'},
954-
{"rotating_tensor", required_argument, 0, 'R'},
969+
{"rotating_tensor", required_argument, 0, 'E'},
970+
{"local_register", required_argument, 0, 'R'},
955971
{"help", no_argument, 0, 'h'},
956972
{}
957973
};
958974

959975
while(1) {
960976
int c;
961977

962-
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:R:a:y:s:u:h:q:", longopts, &longindex);
978+
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:E:R:a:y:s:u:h:q:", longopts, &longindex);
963979

964980
if (c == -1)
965981
break;
@@ -1067,14 +1083,22 @@ int main(int argc, char* argv[]) {
10671083
gpu_block3 = deviceProps.multiProcessorCount * 60;
10681084
}
10691085
break;
1070-
case 'R':
1086+
case 'E':
10711087
enable_rotating_tensor = strtol(optarg, NULL, 0);
10721088
break;
10731089
case 'a':
10741090
average = (int)strtol(optarg, NULL, 0);
10751091
break;
10761092
case 'q':
10771093
delay_inout_place = (int)strtol(optarg, NULL, 10);
1094+
case 'R':
1095+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1096+
if ((int)strtol(optarg, NULL, 0)) {
1097+
local_register = 1;
1098+
}
1099+
#else
1100+
printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
1101+
#endif
10781102
break;
10791103
case 'h':
10801104
default:
@@ -1109,10 +1133,11 @@ int main(int argc, char* argv[]) {
11091133
"[-G,--cudagraph <num graph launches>] \n\t"
11101134
"[-C,--report_cputime <0/1>] \n\t"
11111135
"[-O,--out_of_place <0/1>] \n\t"
1112-
"[-F,--cache_flush <number of iterations between instruction cache flush>] \n\t"
1113-
"[-R,--rotating_tensor <0/1>] \n\t"
1136+
"[-F,--cache_flush <number of iterations between instruction cache flush>] \n\t"
1137+
"[-E,--rotating_tensor <0/1>] \n\t"
11141138
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
11151139
"[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
1140+
"[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
11161141
"[-h,--help]\n",
11171142
basename(argv[0]));
11181143
return 0;
@@ -1256,6 +1281,10 @@ testResult_t run() {
12561281

12571282
//if parallel init is not selected, use main thread to initialize NCCL
12581283
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
1284+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1285+
void **sendRegHandles = NULL;
1286+
void **recvRegHandles = NULL;
1287+
#endif
12591288
if (!parallel_init) {
12601289
if (ncclProcs == 1) {
12611290
NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
@@ -1267,6 +1296,14 @@ testResult_t run() {
12671296
}
12681297
NCCLCHECK(ncclGroupEnd());
12691298
}
1299+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1300+
sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
1301+
recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
1302+
for (int i=0; i<nGpus*nThreads; i++) {
1303+
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
1304+
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
1305+
}
1306+
#endif
12701307
}
12711308

12721309
int errors[nThreads];
@@ -1352,18 +1389,33 @@ testResult_t run() {
13521389
#endif
13531390

13541391
if (!parallel_init) {
1355-
for(int i=0; i<nGpus*nThreads; ++i)
1392+
for(int i=0; i<nGpus*nThreads; ++i) {
1393+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1394+
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
1395+
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
1396+
#endif
13561397
NCCLCHECK(ncclCommDestroy(comms[i]));
1398+
}
13571399
free(comms);
13581400
}
13591401

13601402
// Free off CUDA allocated memory
13611403
for (int i=0; i<nGpus*nThreads; i++) {
1404+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1405+
if (sendbuffs[i]) NCCLCHECK(ncclMemFree((char*)sendbuffs[i]));
1406+
if (recvbuffs[i]) NCCLCHECK(ncclMemFree((char*)recvbuffs[i]));
1407+
if (datacheck) NCCLCHECK(ncclMemFree(expected[i]));
1408+
#else
13621409
if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
13631410
if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
13641411
if (datacheck) CUDACHECK(cudaFree(expected[i]));
1412+
#endif
13651413
}
13661414
CUDACHECK(cudaFreeHost(delta));
1415+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1416+
free(sendRegHandles);
1417+
free(recvRegHandles);
1418+
#endif
13671419

13681420
envstr = getenv("NCCL_TESTS_MIN_BW");
13691421
double check_avg_bw = envstr ? atof(envstr) : -1;

0 commit comments

Comments
 (0)