@@ -94,6 +94,7 @@ static int numDevices = 1;
// Multiplier applied to the per-thread comms/streams offsets in run()
// (t*nGpus*ranksPerGpu); exposed as -R/--ranks_per_gpu when
// RCCL_MULTIRANKPERGPU is defined.
9494static int ranksPerGpu = 1 ;
// Flag exposed as -x/--enable_multiranks <0/1> ("enable using multiple ranks
// per GPU") when RCCL_MULTIRANKPERGPU is defined.
9595static int enable_multiranks = 0 ;
// Value handed to usleep() in TimeTest after the out-of-place run and before
// the in-place run (usleep interprets it as microseconds).
9696static int delay_inout_place = 0 ;
// When nonzero (the default), TimeTest runs the out-of-place benchmark before
// the in-place one and run() prints both column groups in the table header.
// When 0, only the in-place benchmark is timed and only its columns are
// printed. Set from the command line with -O/--out_of_place <0/1>.
97+ static int enable_out_of_place = 1 ;
9798
9899#define NUM_BLOCKS 32
99100
@@ -653,8 +654,10 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
653654 char rootName[100 ];
654655 sprintf (rootName, " %6i" , root);
655656 PRINT (" %12li %12li %8s %6s %6s" , std::max (args->sendBytes , args->expectedBytes ), args->nbytes / wordSize (type), typeName, opName, rootName);
656- TESTCHECK (BenchTime (args, type, op, root, 0 ));
657- usleep (delay_inout_place);
657+ if (enable_out_of_place) {
658+ TESTCHECK (BenchTime (args, type, op, root, 0 ));
659+ usleep (delay_inout_place);
660+ }
658661 TESTCHECK (BenchTime (args, type, op, root, 1 ));
659662 PRINT (" \n " );
660663 }
@@ -795,6 +798,7 @@ int main(int argc, char* argv[]) {
795798 {" cudagraph" , required_argument, 0 , ' G' },
796799 {" report_cputime" , required_argument, 0 , ' C' },
797800 {" average" , required_argument, 0 , ' a' },
801+ {" out_of_place" , required_argument, 0 , ' O' },
798802#ifdef RCCL_MULTIRANKPERGPU
799803 {" enable_multiranks" , required_argument, 0 , ' x' },
800804 {" ranks_per_gpu" , required_argument, 0 , ' R' },
@@ -807,9 +811,9 @@ int main(int argc, char* argv[]) {
807811 int c;
808812
809813#ifdef RCCL_MULTIRANKPERGPU
810- c = getopt_long (argc, argv, " t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:" , longopts, &longindex);
814+ c = getopt_long (argc, argv, " t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O: a:y:s:u:h:R:x:q:" , longopts, &longindex);
811815#else
812- c = getopt_long (argc, argv, " t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:" , longopts, &longindex);
816+ c = getopt_long (argc, argv, " t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O: a:y:s:u:h:q:" , longopts, &longindex);
813817#endif
814818
815819 if (c == -1 )
@@ -907,6 +911,9 @@ int main(int argc, char* argv[]) {
907911 case ' C' :
908912 report_cputime = strtol (optarg, NULL , 0 );
909913 break ;
914+ case ' O' :
915+ enable_out_of_place = strtol (optarg, NULL , 0 );
916+ break ;
910917 case ' a' :
911918 average = (int )strtol (optarg, NULL , 0 );
912919 break ;
@@ -953,6 +960,7 @@ int main(int argc, char* argv[]) {
953960 " [-T,--timeout <time in seconds>] \n\t "
954961 " [-G,--cudagraph <num graph launches>] \n\t "
955962 " [-C,--report_cputime <0/1>] \n\t "
963+ " [-O,--out_of_place <0/1>] \n\t "
956964 " [-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t "
957965#ifdef RCCL_MULTIRANKPERGPU
958966 " [-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t "
@@ -1173,11 +1181,19 @@ testResult_t run() {
11731181
11741182 const char * timeStr = report_cputime ? " cputime" : " time" ;
11751183 PRINT (" #\n " );
1176- PRINT (" # %10s %12s %8s %6s %6s out-of-place in-place \n " , " " , " " , " " , " " , " " );
1177- PRINT (" # %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n " , " size" , " count" , " type" , " redop" , " root" ,
1178- timeStr, " algbw" , " busbw" , " #wrong" , timeStr, " algbw" , " busbw" , " #wrong" );
1179- PRINT (" # %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n " , " (B)" , " (elements)" , " " , " " , " " ,
1180- " (us)" , " (GB/s)" , " (GB/s)" , " " , " (us)" , " (GB/s)" , " (GB/s)" , " " );
1184+ if (enable_out_of_place) {
1185+ PRINT (" # %10s %12s %8s %6s %6s out-of-place in-place \n " , " " , " " , " " , " " , " " );
1186+ PRINT (" # %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n " , " size" , " count" , " type" , " redop" , " root" ,
1187+ timeStr, " algbw" , " busbw" , " #wrong" , timeStr, " algbw" , " busbw" , " #wrong" );
1188+ PRINT (" # %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n " , " (B)" , " (elements)" , " " , " " , " " ,
1189+ " (us)" , " (GB/s)" , " (GB/s)" , " " , " (us)" , " (GB/s)" , " (GB/s)" , " " );
1190+ } else {
1191+ PRINT (" # %10s %12s %8s %6s %6s in-place \n " , " " , " " , " " , " " , " " );
1192+ PRINT (" # %10s %12s %8s %6s %6s %7s %6s %6s %6s\n " , " size" , " count" , " type" , " redop" , " root" ,
1193+ timeStr, " algbw" , " busbw" , " #wrong" );
1194+ PRINT (" # %10s %12s %8s %6s %6s %7s %6s %6s %5s\n " , " (B)" , " (elements)" , " " , " " , " " ,
1195+ " (us)" , " (GB/s)" , " (GB/s)" , " " );
1196+ }
11811197
11821198 struct testThread threads[nThreads];
11831199 memset (threads, 0 , sizeof (struct testThread )*nThreads);
@@ -1205,7 +1221,7 @@ testResult_t run() {
12051221 threads[t].args .ncclId = ncclId;
12061222 threads[t].args .comms =comms+t*nGpus*ranksPerGpu;
12071223 threads[t].args .streams =streams+t*nGpus*ranksPerGpu;
1208-
1224+ threads[t]. args . enable_out_of_place =enable_out_of_place;
12091225 threads[t].args .errors =errors+t;
12101226 threads[t].args .bw =bw+t;
12111227 threads[t].args .bw_count =bw_count+t;
0 commit comments