Skip to content

Commit 48f22de

Browse files
authored
Merge pull request ROCm#57 from nusislam/out-of-place
Add option to disable out-of-place runs
2 parents b1f86ea + a2bec5d commit 48f22de

File tree

2 files changed, with 27 additions and 10 deletions.

‎src/common.cu‎

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ static int numDevices = 1;
9494
static int ranksPerGpu = 1;
9595
static int enable_multiranks = 0;
9696
static int delay_inout_place = 0;
97+
static int enable_out_of_place = 1;
9798

9899
#define NUM_BLOCKS 32
99100

@@ -653,8 +654,10 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
653654
char rootName[100];
654655
sprintf(rootName, "%6i", root);
655656
PRINT("%12li %12li %8s %6s %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
656-
TESTCHECK(BenchTime(args, type, op, root, 0));
657-
usleep(delay_inout_place);
657+
if (enable_out_of_place) {
658+
TESTCHECK(BenchTime(args, type, op, root, 0));
659+
usleep(delay_inout_place);
660+
}
658661
TESTCHECK(BenchTime(args, type, op, root, 1));
659662
PRINT("\n");
660663
}
@@ -795,6 +798,7 @@ int main(int argc, char* argv[]) {
795798
{"cudagraph", required_argument, 0, 'G'},
796799
{"report_cputime", required_argument, 0, 'C'},
797800
{"average", required_argument, 0, 'a'},
801+
{"out_of_place", required_argument, 0, 'O'},
798802
#ifdef RCCL_MULTIRANKPERGPU
799803
{"enable_multiranks", required_argument, 0, 'x'},
800804
{"ranks_per_gpu", required_argument, 0, 'R'},
@@ -807,9 +811,9 @@ int main(int argc, char* argv[]) {
807811
int c;
808812

809813
#ifdef RCCL_MULTIRANKPERGPU
810-
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:", longopts, &longindex);
814+
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:a:y:s:u:h:R:x:q:", longopts, &longindex);
811815
#else
812-
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:", longopts, &longindex);
816+
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:a:y:s:u:h:q:", longopts, &longindex);
813817
#endif
814818

815819
if (c == -1)
@@ -907,6 +911,9 @@ int main(int argc, char* argv[]) {
907911
case 'C':
908912
report_cputime = strtol(optarg, NULL, 0);
909913
break;
914+
case 'O':
915+
enable_out_of_place = strtol(optarg, NULL, 0);
916+
break;
910917
case 'a':
911918
average = (int)strtol(optarg, NULL, 0);
912919
break;
@@ -953,6 +960,7 @@ int main(int argc, char* argv[]) {
953960
"[-T,--timeout <time in seconds>] \n\t"
954961
"[-G,--cudagraph <num graph launches>] \n\t"
955962
"[-C,--report_cputime <0/1>] \n\t"
963+
"[-O,--out_of_place <0/1>] \n\t"
956964
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
957965
#ifdef RCCL_MULTIRANKPERGPU
958966
"[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
@@ -1173,11 +1181,19 @@ testResult_t run() {
11731181

11741182
const char* timeStr = report_cputime ? "cputime" : "time";
11751183
PRINT("#\n");
1176-
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
1177-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
1178-
timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
1179-
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
1180-
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
1184+
if (enable_out_of_place) {
1185+
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
1186+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
1187+
timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
1188+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
1189+
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
1190+
} else {
1191+
PRINT("# %10s %12s %8s %6s %6s in-place \n", "", "", "", "", "");
1192+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
1193+
timeStr, "algbw", "busbw", "#wrong");
1194+
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
1195+
"(us)", "(GB/s)", "(GB/s)", "");
1196+
}
11811197

11821198
struct testThread threads[nThreads];
11831199
memset(threads, 0, sizeof(struct testThread)*nThreads);
@@ -1205,7 +1221,7 @@ testResult_t run() {
12051221
threads[t].args.ncclId = ncclId;
12061222
threads[t].args.comms=comms+t*nGpus*ranksPerGpu;
12071223
threads[t].args.streams=streams+t*nGpus*ranksPerGpu;
1208-
1224+
threads[t].args.enable_out_of_place=enable_out_of_place;
12091225
threads[t].args.errors=errors+t;
12101226
threads[t].args.bw=bw+t;
12111227
threads[t].args.bw_count=bw_count+t;

‎src/common.h‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ struct threadArgs {
127127
int localRank;
128128
int localNumDevices;
129129
int enable_multiranks;
130+
int enable_out_of_place;
130131
int nRanks;
131132
void** sendbuffs;
132133
size_t sendBytes;

0 commit comments

Comments
 (0)