Skip to content

Commit a89cf07

Browse files
jynvAddyLaddy
authored and committed
Perftests: Introduce NCCL_TESTS_SPLIT env
`NCCL_TESTS_SPLIT` serves as a new way of computing the color used for splitting communicators. It is overridden by `NCCL_TESTS_SPLIT_MASK` when both are set. Examples: NCCL_TESTS_SPLIT_MASK="0x7" # color = rank & 0x7. What we do today to run on a DGX with one GPU per node. NCCL_TESTS_SPLIT="AND 0x7" # color = rank & 0x7. New way to run on one GPU per node on a DGX, equivalent to NCCL_TESTS_SPLIT_MASK=0x7. NCCL_TESTS_SPLIT="MOD 72" # color = rank % 72. One GPU per NVLink domain on an NVL72 system. NCCL_TESTS_SPLIT="DIV 72" # color = rank / 72. Intra NVLink domain on NVL72. You can also use the short forms "%" (MOD), "&" (AND), "|" (OR) and "/" (DIV). Whitespace between the operator and the number is ignored, and operator names are not case sensitive. The following are all equivalent: NCCL_TESTS_SPLIT="&0x7" NCCL_TESTS_SPLIT="&0b111" NCCL_TESTS_SPLIT="AND 7" NCCL_TESTS_SPLIT="and 0x7"
1 parent cb6a46f commit a89cf07

File tree

1 file changed

+48
-3
lines changed

1 file changed

+48
-3
lines changed

‎src/common.cu‎

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#include <type_traits>
1111
#include <getopt.h>
1212
#include <libgen.h>
13+
#include <string.h>
14+
#include <ctype.h>
1315
#include "cuda.h"
1416

1517
#include "../verifiable/verifiable.h"
@@ -892,6 +894,26 @@ int main(int argc, char* argv[]) {
892894
return 0;
893895
}
894896

897+
#ifdef MPI_SUPPORT
898+
// Parse an integer literal from `s` into `*num`.
//
// Leading whitespace is skipped. A "0b"/"0B" prefix selects base 2 and must be
// followed immediately by a binary digit. Anything else is handed to strtoul()
// with base 0, so "0x.." is hexadecimal, a leading "0" is octal, and plain
// digits are decimal. Characters following the number are ignored.
//
// Returns true and stores the value on success. Returns false and leaves
// `*num` untouched when either pointer is NULL, the string is empty or blank,
// or no digits could be converted (including a bare "0b" prefix).
//
// The unsigned long result is narrowed to int with a plain cast; values that
// do not fit in an int are not range-checked here.
static bool parseInt(char *s, int *num) {
  if (s == NULL || num == NULL) return false;

  // <ctype.h> classifiers need a value representable as unsigned char.
  while (isspace((unsigned char)*s)) ++s;
  if (*s == '\0') return false;

  int base = 0;
  char *digits = s;
  if (s[0] == '0' && (s[1] == 'b' || s[1] == 'B')) {
    base = 2;
    digits = s + 2;
    // strtoul() would skip whitespace or accept a sign at this point; insist
    // on a real binary digit so "0b", "0b2" or "0b -1" are rejected.
    if (digits[0] != '0' && digits[0] != '1') return false;
  }

  char *end = NULL;
  unsigned long value = strtoul(digits, &end, base);
  // Compare against where the digits start (not `s`), otherwise an empty
  // binary literal would be reported as a successful parse of 0.
  if (end == digits) return false;

  *num = (int)value;
  return true;
}
915+
#endif
916+
895917
testResult_t run() {
896918
int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
897919
int localRank = 0;
@@ -909,10 +931,33 @@ testResult_t run() {
909931
if (hostHashs[p] == hostHashs[proc]) localRank++;
910932
}
911933

912-
char* str = getenv("NCCL_TESTS_SPLIT_MASK");
913-
uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
934+
char *splitMaskEnv = NULL;
935+
if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT_MASK")) {
936+
color = proc & strtoul(splitMaskEnv, NULL, 16);
937+
} else if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT")) {
938+
if (
939+
(strncasecmp(splitMaskEnv, "AND", strlen("AND")) == 0 && parseInt(splitMaskEnv + strlen("AND"), &color)) ||
940+
(strncasecmp(splitMaskEnv, "&", strlen("&")) == 0 && parseInt(splitMaskEnv + strlen("&"), &color))
941+
)
942+
color = proc & color;
943+
if (
944+
(strncasecmp(splitMaskEnv, "OR", strlen("OR")) == 0 && parseInt(splitMaskEnv + strlen("OR"), &color)) ||
945+
(strncasecmp(splitMaskEnv, "|", strlen("|")) == 0 && parseInt(splitMaskEnv + strlen("|"), &color))
946+
)
947+
color = proc | color;
948+
if (
949+
(strncasecmp(splitMaskEnv, "MOD", strlen("MOD")) == 0 && parseInt(splitMaskEnv + strlen("MOD"), &color)) ||
950+
(strncasecmp(splitMaskEnv, "%", strlen("%")) == 0 && parseInt(splitMaskEnv + strlen("%"), &color))
951+
)
952+
color = proc % color;
953+
if (
954+
(strncasecmp(splitMaskEnv, "DIV", strlen("DIV")) == 0 && parseInt(splitMaskEnv + strlen("DIV"), &color)) ||
955+
(strncasecmp(splitMaskEnv, "/", strlen("/")) == 0 && parseInt(splitMaskEnv + strlen("/"), &color))
956+
)
957+
color = proc / color;
958+
}
959+
914960
MPI_Comm mpi_comm;
915-
color = proc & mask;
916961
MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
917962
MPI_Comm_size(mpi_comm, &ncclProcs);
918963
MPI_Comm_rank(mpi_comm, &ncclProc);

0 commit comments

Comments
 (0)