Skip to content

Commit 0c94d4d

Browse files
authored
Enable viewing algo/proto/channels used in rccl-tests output (ROCm#151)
* Enable algo/proto/channel viewing * Use dynamic symbol loading to avoid build/runtime issues with non-compatible RCCL versions * Reduce code duplication
1 parent e1b8a3a commit 0c94d4d

File tree

14 files changed

+193
-40
lines changed

14 files changed

+193
-40
lines changed

‎README.md‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ All tests support the same set of arguments :
148148
* Parsing RCCL-Tests output
149149
* `-Z,--output_format <csv|json>` Parse RCCL-Tests output as a CSV or JSON. Default : disabled.
150150
* `-x,--output_file <output file name>` RCCL-Tests output file name. Default : disabled.
151+
* `-M,--output_algo_proto_channels <0/1>` Report Algorithm/Protocol/Channels for each message size. Default : 0.
151152

152153
### Running multiple operations in parallel
153154

‎src/all_gather.cu‎

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cuda_runtime.h"
99
#include "common.h"
10+
#include "rccl_compat.h"
1011

1112
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1213
size_t base = (count/nranks) & -(16/eltSize);
@@ -36,6 +37,13 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
3637
return testSuccess;
3738
}
3839

40+
// Reports the algorithm, protocol and channel count that the dynamically
// resolved rcclTestsGetAlgoInfo entry point returns for an all-gather of
// `count` elements of `type` on `comm`.
//
// Results are written through `algo`, `proto` and `nchannels`.
// Returns testInternalError when rcclTestsGetAlgoInfo is NULL, testSuccess
// when the query completes; a failing query is handled by NCCLCHECK.
testResult_t AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (!rcclTestsGetAlgoInfo) {
    return testInternalError;
  }

  // The literals 0, 0, 1 fill the collNetSupport, nvlsSupport and numPipeOps
  // parameters of rcclTestsGetAlgoInfo_t, in that order.
  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllGather, count, type, 0, 0, 1, algo, proto, nchannels));
  return testSuccess;
}
45+
46+
3947
void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
4048
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
4149

@@ -54,7 +62,8 @@ struct testColl allGatherTest = {
5462
AllGatherGetCollByteCount,
5563
AllGatherInitData,
5664
AllGatherGetBw,
57-
AllGatherRunColl
65+
AllGatherRunColl,
66+
AllGatherGetAlgoProtoChannels
5867
};
5968

6069
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {

‎src/all_reduce.cu‎

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cuda_runtime.h"
99
#include "common.h"
10+
#include "rccl_compat.h"
1011

1112
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1213
*sendcount = count;
@@ -33,6 +34,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
3334
return testSuccess;
3435
}
3536

37+
// Reports the algorithm, protocol and channel count that the dynamically
// resolved rcclTestsGetAlgoInfo entry point returns for an all-reduce of
// `count` elements of `type` on `comm`.
//
// Results are written through `algo`, `proto` and `nchannels`.
// Returns testInternalError when rcclTestsGetAlgoInfo is NULL, testSuccess
// when the query completes; a failing query is handled by NCCLCHECK.
testResult_t AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (!rcclTestsGetAlgoInfo) {
    return testInternalError;
  }

  // The literals 0, 0, 1 fill the collNetSupport, nvlsSupport and numPipeOps
  // parameters of rcclTestsGetAlgoInfo_t, in that order.
  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce, count, type, 0, 0, 1, algo, proto, nchannels));
  return testSuccess;
}
42+
3643
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
3744
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
3845

@@ -51,7 +58,8 @@ struct testColl allReduceTest = {
5158
AllReduceGetCollByteCount,
5259
AllReduceInitData,
5360
AllReduceGetBw,
54-
AllReduceRunColl
61+
AllReduceRunColl,
62+
AllReduceGetAlgoProtoChannels
5563
};
5664

5765
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {

‎src/alltoall.cu‎

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cuda_runtime.h"
99
#include "common.h"
10+
#include "rccl_compat.h"
1011

1112
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1213
*paramcount = (count/nranks) & -(16/eltSize);
@@ -56,7 +57,8 @@ struct testColl alltoAllTest = {
5657
AlltoAllGetCollByteCount,
5758
AlltoAllInitData,
5859
AlltoAllGetBw,
59-
AlltoAllRunColl
60+
AlltoAllRunColl,
61+
NULL
6062
};
6163

6264
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {

‎src/alltoallv.cu‎

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cuda_runtime.h"
99
#include "common.h"
10+
#include "rccl_compat.h"
1011

1112
#define USE_RCCL_GATHER_SCATTER
1213

@@ -156,7 +157,8 @@ struct testColl alltoAllTest = {
156157
AlltoAllvGetCollByteCount,
157158
AlltoAllvInitData,
158159
AlltoAllvGetBw,
159-
AlltoAllvRunColl
160+
AlltoAllvRunColl,
161+
NULL
160162
};
161163

162164
void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {

‎src/broadcast.cu‎

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cuda_runtime.h"
99
#include "common.h"
10+
#include "rccl_compat.h"
1011

1112
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1213
*sendcount = count;
@@ -32,6 +33,12 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
3233
return testSuccess;
3334
}
3435

36+
// Reports the algorithm, protocol and channel count that the dynamically
// resolved rcclTestsGetAlgoInfo entry point returns for a broadcast of
// `count` elements of `type` on `comm`.
//
// Results are written through `algo`, `proto` and `nchannels`.
// Returns testInternalError when rcclTestsGetAlgoInfo is NULL, testSuccess
// when the query completes; a failing query is handled by NCCLCHECK.
testResult_t BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (!rcclTestsGetAlgoInfo) {
    return testInternalError;
  }

  // The literals 0, 0, 1 fill the collNetSupport, nvlsSupport and numPipeOps
  // parameters of rcclTestsGetAlgoInfo_t, in that order.
  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast, count, type, 0, 0, 1, algo, proto, nchannels));
  return testSuccess;
}
41+
3542
void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
3643
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
3744

@@ -60,7 +67,8 @@ struct testColl broadcastTest = {
6067
BroadcastGetCollByteCount,
6168
BroadcastInitData,
6269
BroadcastGetBw,
63-
BroadcastRunColl
70+
BroadcastRunColl,
71+
BroadcastGetAlgoProtoChannels
6472
};
6573

6674
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {

‎src/common.cu‎

Lines changed: 78 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include <vector>
2323
#include <utility>
2424
#include <errno.h> /* program_invocation_short_name */
25-
25+
#include <dlfcn.h>
2626
//#define DEBUG_PRINT
2727

2828
#include "verifiable.h"
@@ -35,6 +35,24 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
3535
int32_t gpu_block3;
3636
size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
3737

38+
rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL;
39+
rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL;
40+
rcclTestsGetAlgoName_t rcclTestsGetAlgoName= NULL;
41+
static void loadRcclSyms() {
42+
static void* handle = NULL;
43+
const char* libname = "librccl.so";
44+
if (!handle) {
45+
handle = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
46+
if (!handle) {
47+
fprintf(stderr, "dlopen failed: %s\n", dlerror());
48+
return;
49+
}
50+
}
51+
rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo");
52+
rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName");
53+
rcclTestsGetProtocolName = (rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName");
54+
}
55+
3856
// RCCL_FLOAT8 support
3957
bool rccl_float8_useFnuz = false;
4058
bool IsArchMatch(char const* arch, char const* target) {
@@ -109,6 +127,7 @@ static int nccltype = ncclFloat;
109127
static int ncclroot = 0;
110128
static int parallel_init = 0;
111129
static int blocking_coll = 0;
130+
static int output_algo_proto_channels = 0;
112131
static int memorytype = 0;
113132
static uint32_t cumask[4];
114133
static int streamnull = 0;
@@ -944,8 +963,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
944963
TESTCHECK(BenchTime(args, type, op, root, 0));
945964
usleep(delay_inout_place);
946965
}
947-
if (enable_in_place)
966+
if (enable_in_place)
948967
TESTCHECK(BenchTime(args, type, op, root, 1));
968+
if(output_algo_proto_channels) {
969+
if(args->collTest->getAlgoProtoChannels) {
970+
int algo, proto, nchannels;
971+
const char* algoName = NULL;
972+
const char* protoName = NULL;
973+
TESTCHECK(args->collTest->getAlgoProtoChannels(args->comms[0], args->nbytes / wordSize(type), type, &algo, &proto, &nchannels));
974+
NCCLCHECK(rcclTestsGetAlgoName(algo, &algoName));
975+
NCCLCHECK(rcclTestsGetProtocolName(proto, &protoName));
976+
PRINT("%8s %8s %10d", algoName, protoName, nchannels);
977+
} else {
978+
PRINT("%8s %8s %10s","N/A", "N/A", "N/A");
979+
}
980+
}
949981
PRINT("\n");
950982
}
951983
--repeat;
@@ -1108,7 +1140,7 @@ int main(int argc, char* argv[]) {
11081140
}
11091141
#endif
11101142
#endif
1111-
1143+
loadRcclSyms();
11121144
// Parse args
11131145
double parsed;
11141146
int longindex;
@@ -1135,22 +1167,23 @@ int main(int argc, char* argv[]) {
11351167
{"report_cputime", required_argument, 0, 'C'},
11361168
{"average", required_argument, 0, 'a'},
11371169
{"local_register", required_argument, 0, 'R'},
1138-
{"memory_type", required_argument, 0, 'y'}, //RCCL
1139-
{"cumask", required_argument, 0, 'u'}, //RCCL
1140-
{"out_of_place", required_argument, 0, 'O'}, //RCCL
1141-
{"delay_inout_place", required_argument, 0, 'q'}, //RCCL
1142-
{"cache_flush", required_argument, 0, 'F'}, //RCCL
1143-
{"rotating_tensor", required_argument, 0, 'E'}, //RCCL
1144-
{"output_file", required_argument, 0, 'x'}, //RCCL
1145-
{"output_format", required_argument, 0, 'Z'}, //RCCL
1170+
{"memory_type", required_argument, 0, 'y'}, //RCCL
1171+
{"cumask", required_argument, 0, 'u'}, //RCCL
1172+
{"out_of_place", required_argument, 0, 'O'}, //RCCL
1173+
{"delay_inout_place", required_argument, 0, 'q'}, //RCCL
1174+
{"cache_flush", required_argument, 0, 'F'}, //RCCL
1175+
{"rotating_tensor", required_argument, 0, 'E'}, //RCCL
1176+
{"output_file", required_argument, 0, 'x'}, //RCCL
1177+
{"output_format", required_argument, 0, 'Z'}, //RCCL
1178+
{"output_algo_proto_channels", required_argument, 0, 'M'}, //RCCL
11461179
{"help", no_argument, 0, 'h'},
11471180
{}
11481181
};
11491182

11501183
while(1) {
11511184
int c;
11521185

1153-
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:h", longopts, &longindex);
1186+
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:M:h", longopts, &longindex);
11541187

11551188
if (c == -1)
11561189
break;
@@ -1290,6 +1323,10 @@ int main(int argc, char* argv[]) {
12901323
case 'Z':
12911324
output_format = optarg;
12921325
break;
1326+
case 'M':
1327+
output_algo_proto_channels = strtol(optarg, NULL, 0);
1328+
if(rcclTestsGetAlgoInfo == NULL || rcclTestsGetAlgoName == NULL || rcclTestsGetProtocolName == NULL) output_algo_proto_channels = 0;
1329+
break;
12931330
case 'h':
12941331
default:
12951332
if (c != 'h') printf("invalid option '%c'\n", c);
@@ -1607,27 +1644,39 @@ testResult_t run() {
16071644
}
16081645

16091646
fflush(stdout);
1610-
1647+
const char* extra_col_str[3] = {"", "", ""};
1648+
if (output_algo_proto_channels) {
1649+
extra_col_str[0] = "algo";
1650+
extra_col_str[1] = "proto";
1651+
extra_col_str[2] = "nchannels";
1652+
}
1653+
const char* header_col_str[3] = {" out-of-place in-place ",
1654+
" out-of-place "," in-place "};
1655+
int header_index =(enable_out_of_place && enable_in_place) ? 0 : (enable_out_of_place ? 1 : 2);
16111656
const char* timeStr = report_cputime ? "cputime" : "time";
1657+
16121658
PRINT("#\n");
1659+
PRINT("# %10s %12s %8s %6s %6s%s\n", "", "", "", "", "", header_col_str[header_index]);
16131660
if (enable_out_of_place && enable_in_place) {
1614-
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
1615-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
1616-
timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
1617-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
1618-
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
1619-
} else if (enable_out_of_place) {
1620-
PRINT("# %10s %12s %8s %6s %6s out-of-place \n", "", "", "", "", "");
1621-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
1622-
timeStr, "algbw", "busbw", "#wrong");
1623-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
1624-
"(us)", "(GB/s)", "(GB/s)", "");
1661+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n",
1662+
"size", "count", "type", "redop", "root",
1663+
timeStr, "algbw", "busbw", "#wrong",
1664+
timeStr, "algbw", "busbw", "#wrong",
1665+
extra_col_str[0], extra_col_str[1], extra_col_str[2]);
1666+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s %8s %8s %10s\n",
1667+
"(B)", "(elements)", "", "", "",
1668+
"(us)", "(GB/s)", "(GB/s)", "",
1669+
"(us)", "(GB/s)", "(GB/s)", "",
1670+
"", "", "");
16251671
} else {
1626-
PRINT("# %10s %12s %8s %6s %6s in-place \n", "", "", "", "", "");
1627-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
1628-
timeStr, "algbw", "busbw", "#wrong");
1629-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
1630-
"(us)", "(GB/s)", "(GB/s)", "");
1672+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n",
1673+
"size", "count", "type", "redop", "root",
1674+
timeStr, "algbw", "busbw", "#wrong",
1675+
extra_col_str[0], extra_col_str[1], extra_col_str[2]);
1676+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %8s %8s %10s\n",
1677+
"(B)", "(elements)", "", "", "",
1678+
"(us)", "(GB/s)", "(GB/s)", "",
1679+
"", "", "");
16311680
}
16321681
Reporter reporter(output_file, output_format);
16331682

‎src/common.h‎

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
************************************************************************/
88
#ifndef __COMMON_H__
99
#define __COMMON_H__
10-
1110
#include "rccl/rccl.h"
1211
#include <stdio.h>
1312
#include <cstdint>
@@ -107,6 +106,7 @@ struct testColl {
107106
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
108107
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
109108
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
109+
testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
110110
};
111111
extern struct testColl allReduceTest;
112112
extern struct testColl allGatherTest;
@@ -375,4 +375,24 @@ extern int is_main_proc;
375375
extern thread_local int is_main_thread;
376376
#define PRINT if (is_main_thread) printf
377377

378+
// Collective identifiers passed as the `coll` argument of the
// rcclTestsGetAlgoInfo_t entry point.
// NOTE(review): presumably these values have to match the ncclFunc_t numbering
// of the RCCL library loaded at runtime -- confirm against the RCCL sources
// before changing any of them.
typedef enum {
  ncclFuncBroadcast         = 0,
  ncclFuncReduce            = 1,
  ncclFuncAllGather         = 2,
  ncclFuncReduceScatter     = 3,
  ncclFuncAllReduce         = 4,
  ncclFuncAllReduceWithBias = 5,
  ncclFuncSendRecv          = 6,
  ncclFuncSend              = 7,
  ncclFuncRecv              = 8,
  ncclFuncAllToAllPivot     = 9,
  ncclNumFuncs              = 10  // count of the identifiers above
} ncclFunc_t;
391+
392+
typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
393+
int collNetSupport, int nvlsSupport, int numPipeOps,
394+
int* algo, int* protocol, int* maxChannels);
395+
typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
396+
typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol, const char** protocolName);
397+
378398
#endif

‎src/gather.cu‎

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cuda_runtime.h"
99
#include "common.h"
10+
#include "rccl_compat.h"
1011

1112
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1213
*sendcount = (count/nranks) & -(16/eltSize);
@@ -69,7 +70,8 @@ struct testColl gatherTest = {
6970
GatherGetCollByteCount,
7071
GatherInitData,
7172
GatherGetBw,
72-
GatherRunColl
73+
GatherRunColl,
74+
NULL
7375
};
7476

7577
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {

0 commit comments

Comments
 (0)