Skip to content

Commit 50a2663

Browse files
Merge remote-tracking branch 'nccl-tests/master' into develop
2 parents 2c255c4 + 97ee098 commit 50a2663

File tree

10 files changed

+278
-63
lines changed

10 files changed

+278
-63
lines changed

README.md

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ These tests check both the performance and the correctness of RCCL operations. T
44

55
## Build
66

7-
To build the tests, just type `make`.
7+
To build the tests, just type `make` or `make -j`.
88

99
If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`.
1010

@@ -75,12 +75,14 @@ RCCL Tests can run on multiple processes, multiple threads, and multiple HIP dev
7575
### Quick examples
7676

7777
Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
78+
7879
```shell
7980
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
8081
```
8182

8283
Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
8384
(NB: The rccl-tests binaries must be compiled with `MPI=1` for this case)
85+
8486
```shell
8587
$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
8688
```
@@ -138,8 +140,8 @@ All tests support the same set of arguments :
138140
* `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
139141
* `-G,--hipgraph <num graph launches>` Capture iterations as a HIP graph and then replay specified number of times. Default : 0.
140142
* `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0.
141-
* `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
142-
* `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default: disabled.
143+
* `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
144+
* `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
143145
* `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
144146
* `-O,--out_of_place <0=in-place only, 1=out-of-place only>`. Default: both.
145147
* `-q,--delay <delay>` Delay between out-of-place and in-place runs (in microseconds). Default: 10.
@@ -158,9 +160,12 @@ with the same color will end up in the same group. The resulting group is printe
158160
`NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.
159161

160162
Here are a few examples:
161-
- `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
162-
- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
163-
- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
163+
164+
- `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network)
165+
166+
- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
167+
168+
- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
164169

165170
Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
166171

@@ -178,6 +183,6 @@ $ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3
178183

179184
## Copyright
180185

181-
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
186+
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
182187

183188
All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.

‎src/Makefile‎

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
#
2-
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
33
# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
44
#
55
# See LICENSE.txt for license information
66
#
7+
include common.mk
78

89
ROCM_PATH ?= /opt/rocm
910
MPI_HOME ?= /usr/lib/x86_64-linux-gnu
@@ -21,6 +22,10 @@ HIPCUFLAGS := -std=c++14
2122
LDFLAGS :=
2223
HIPLDFLAGS :=
2324

25+
MPI ?= 0 # Set to 1 to enable MPI support (multi-process/multi-node)
26+
NAME_SUFFIX ?= # e.g. _mpi when using MPI=1
27+
DSO ?= 0 # Set to 1 to create and use libverifiable.so to reduce binary size
28+
2429
HIP_VERSION = $(strip $(shell which $(HIPCONFIG) >/dev/null && $(HIPCONFIG) --version))
2530
HIP_MAJOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 1)
2631
HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
@@ -126,7 +131,7 @@ DST_DIR := $(BUILDDIR)
126131
SRC_FILES := $(wildcard *.cu)
127132
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
128133
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
129-
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
134+
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
130135

131136
GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
132137
GIT_REV := $(shell git log --pretty=format:'%h' -n 1)
@@ -157,20 +162,39 @@ ${HIPIFY_DIR}/%.h: %.h
157162
@mkdir -p ${HIPIFY_DIR}
158163
hipify-perl -quiet-warnings $< > $@
159164

165+
.PRECIOUS: ${DST_DIR}/%.o
166+
160167
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
161168
@printf "Compiling %-35s > %s\n" $< $@
162169
@mkdir -p ${DST_DIR}
163170
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
164171
$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
165172

173+
${DST_DIR}/%$(NAME_SUFFIX).o: %.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
174+
@printf "Compiling %-35s > %s\n" $< $@
175+
@mkdir -p ${DST_DIR}
176+
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
177+
$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
178+
166179
${DST_DIR}/timer.o: timer.cc timer.h
167180
@printf "Compiling %-35s > %s\n" $< $@
168181
@mkdir -p ${DST_DIR}
169-
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
182+
$(CXX) $(CXXFLAGS) -o $@ -c $<
170183

171-
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
184+
ifeq ($(DSO), 1)
185+
${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS) $(DST_DIR)/src/git_version.cpp
186+
@printf "Linking %-35s > %s\n" $< $@
187+
@mkdir -p ${DST_DIR}
188+
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
189+
$(HIPCC) -o $@ $^ $(HIPLDFLAGS) -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
190+
else
191+
${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
172192
@printf "Linking %-35s > %s\n" $< $@
173193
@mkdir -p ${DST_DIR}
174194
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
175195
$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
196+
endif
197+
198+
clean_intermediates:
199+
rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS)
176200

‎src/common.cu‎

Lines changed: 87 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,19 @@ bool IsArchMatch(char const* arch, char const* target) {
4141
#if NCCL_MAJOR >= 2
4242
ncclDataType_t test_types[ncclNumTypes] = {
4343
ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
44-
#if RCCL_BFLOAT16 == 1
44+
#if HAVE_BF16
4545
, ncclBfloat16
4646
#endif
47-
#if RCCL_FLOAT8 == 1
47+
#if HAVE_FP8
4848
, ncclFloat8e4m3, ncclFloat8e5m2
4949
#endif
5050
};
5151
const char *test_typenames[ncclNumTypes] = {
5252
"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
53-
#if RCCL_BFLOAT16 == 1
53+
#if HAVE_BF16
5454
, "bfloat16"
5555
#endif
56-
#if RCCL_FLOAT8 == 1
56+
#if HAVE_FP8
5757
, "fp8_e4m3", "fp8_e5m2"
5858
#endif
5959
};
@@ -122,8 +122,11 @@ static int enable_in_place = 1;
122122
static int enable_cache_flush = 0;
123123
static int enable_rotating_tensor = 0;
124124
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
125+
#define LOCAL_REGISTER 1
126+
#define SYMMETRIC_REGISTER 2
125127
static int local_register = 0;
126128
#endif
129+
static int minCudaArch = 1<<30;
127130

128131
Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputFormat(outputFormat) {
129132
if (!fileName.empty()) {
@@ -203,7 +206,6 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
203206
}
204207

205208
bool Reporter::isMainThread() { return is_main_thread == 1; }
206-
static int minCudaArch = 1<<30;
207209

208210
#define NUM_BLOCKS 32
209211

@@ -550,10 +552,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
550552
union {
551553
int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
552554
half f16; float f32; double f64;
553-
#if defined(RCCL_BFLOAT16)
555+
#if HAVE_BF16
554556
hip_bfloat16 bf16;
555557
#endif
556-
#if defined(RCCL_FLOAT8)
558+
#if HAVE_FP8
557559
rccl_float8 fp8_e4m3; rccl_bfloat8 fp8_e5m2;
558560
#endif
559561
};
@@ -567,14 +569,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
567569
case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
568570
case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
569571
case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
570-
#if defined(RCCL_BFLOAT16)
572+
#if HAVE_BF16
571573
case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<hip_bfloat16>(rank); break;
572574
#endif
573-
#if defined(RCCL_FLOAT8)
575+
#if HAVE_FP8
574576
case ncclFloat8e4m3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
575577
case ncclFloat8e5m2 : fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
576578
#endif
577-
case ncclNumTypes: break;
579+
default: break; // Just to silence clang
578580
}
579581
NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
580582
}
@@ -957,20 +959,38 @@ testResult_t threadInit(struct threadArgs* args) {
957959
}
958960
NCCLCHECK(ncclGroupEnd());
959961
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
962+
NCCLCHECK(ncclGroupStart());
960963
void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
961964
void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
962965
for (int i=0; i<args->nGpus; i++) {
963-
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
964-
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
966+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
967+
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
968+
NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
969+
NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
970+
} else
971+
#endif
972+
{
973+
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
974+
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
975+
}
965976
}
977+
NCCLCHECK(ncclGroupEnd());
966978
#endif
967979

968980
TESTCHECK(threadRunTests(args));
969981

970982
for (int i=0; i<args->nGpus; i++) {
971983
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
972-
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
973-
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
984+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
985+
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
986+
NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)sendRegHandles[i]));
987+
NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)recvRegHandles[i]));
988+
} else
989+
#endif
990+
{
991+
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
992+
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
993+
}
974994
#endif
975995
NCCLCHECK(ncclCommDestroy(args->comms[i]));
976996
}
@@ -1046,17 +1066,20 @@ int main(int argc, char* argv[]) {
10461066
test_typenum = 9;
10471067
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
10481068
test_opnum++; // ncclAvg
1049-
#if defined(RCCL_BFLOAT16)
1050-
test_typenum++; // bfloat16
1051-
#endif
1052-
#if defined(RCCL_FLOAT8)
1053-
test_typenum++; // fp8_e4m3
1054-
test_typenum++; // fp8_e5m2
1055-
#endif
10561069
}
10571070
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
10581071
test_opnum++; // PreMulSum
10591072
}
1073+
#if defined(RCCL_BFLOAT16)
1074+
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
1075+
test_typenum++; // bfloat16
1076+
}
1077+
#endif
1078+
#if defined(RCCL_FLOAT8)
1079+
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
1080+
test_typenum += 2; // fp8 e4m3,e5m2
1081+
}
1082+
#endif
10601083
#endif
10611084

10621085
// Parse args
@@ -1194,8 +1217,10 @@ int main(int argc, char* argv[]) {
11941217
break;
11951218
case 'R':
11961219
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1197-
if ((int)strtol(optarg, NULL, 0)) {
1198-
local_register = 1;
1220+
local_register = (int)strtol(optarg, NULL, 0);
1221+
if (local_register == SYMMETRIC_REGISTER && test_ncclVersion < NCCL_VERSION(2,27,0)) {
1222+
printf("Option -R 2 (symmetric) is not supported before NCCL 2.27. Defaulting to local registration\n");
1223+
local_register = LOCAL_REGISTER;
11991224
}
12001225
#else
12011226
printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
@@ -1269,7 +1294,7 @@ int main(int argc, char* argv[]) {
12691294
"[-G,--cudagraph <num graph launches>] \n\t"
12701295
"[-C,--report_cputime <0/1>] \n\t"
12711296
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
1272-
"[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
1297+
"[-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers (default: disable (0))] \n\t"
12731298
"[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
12741299
"[-u,--cumask <d0,d1,d2,d3>] \n\t"
12751300
"[-O,--out_of_place <0/1>] \n\t"
@@ -1486,6 +1511,22 @@ testResult_t run() {
14861511
#ifdef MPI_SUPPORT
14871512
MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
14881513
#endif
1514+
#if defined(RCCL_FLOAT8)
1515+
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
1516+
if (minCudaArch < 900) { // Filter out fp8 on pre-Hopper hardware
1517+
int n = 0;
1518+
for (int i=0; i < test_typenum; i++) {
1519+
if (!(test_types[i] == ncclFloat8e4m3 || test_types[i] == ncclFloat8e5m2)) {
1520+
test_types[n] = test_types[i];
1521+
test_typenames[n] = test_typenames[i];
1522+
n += 1;
1523+
}
1524+
}
1525+
test_typenum = n;
1526+
}
1527+
}
1528+
#endif
1529+
14891530
//if parallel init is not selected, use main thread to initialize NCCL
14901531
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
14911532
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
@@ -1504,12 +1545,22 @@ testResult_t run() {
15041545
NCCLCHECK(ncclGroupEnd());
15051546
}
15061547
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1548+
NCCLCHECK(ncclGroupStart());
15071549
sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
15081550
recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
15091551
for (int i=0; i<nGpus*nThreads; i++) {
1510-
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &sendbuffs[i], maxBytes, &sendRegHandles[i]));
1511-
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &recvbuffs[i], maxBytes, &recvRegHandles[i]));
1552+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
1553+
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
1554+
NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
1555+
NCCLCHECK(ncclCommWindowRegister(comms[i], recvbuffs[i], maxBytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
1556+
} else
1557+
#endif
1558+
{
1559+
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
1560+
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
1561+
}
15121562
}
1563+
NCCLCHECK(ncclGroupEnd());
15131564
#endif
15141565
}
15151566

@@ -1607,8 +1658,16 @@ testResult_t run() {
16071658
if (!parallel_init) {
16081659
for(int i=0; i<nGpus*nThreads; ++i) {
16091660
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
1610-
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
1611-
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
1661+
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
1662+
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
1663+
NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)sendRegHandles[i]));
1664+
NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)recvRegHandles[i]));
1665+
} else
1666+
#endif
1667+
{
1668+
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
1669+
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
1670+
}
16121671
#endif
16131672
NCCLCHECK(ncclCommDestroy(comms[i]));
16141673
}

0 commit comments

Comments
 (0)