Skip to content

Commit 29f4114

Browse files
committed
Fix all tests that divide buffers by nranks so that the per-rank buffer sizes are trimmed to multiples of 16 bytes.
This ensures that buffer addresses remain suitably aligned for performance even when the number of ranks is not a power of 2.
1 parent 8dfeab9 commit 29f4114

File tree

12 files changed

+36
-40
lines changed

12 files changed

+36
-40
lines changed

‎src/all_gather.cu‎

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,8 @@
77
#include "cuda_runtime.h"
88
#include "common.h"
99

10-
#define ALIGN 4
11-
12-
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
13-
size_t base = (count/(ALIGN*nranks))*ALIGN;
10+
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
11+
size_t base = (count/nranks) & -(16/eltSize);
1412
*sendcount = base;
1513
*recvcount = base*nranks;
1614
*sendInplaceOffset = base;
@@ -60,7 +58,7 @@ struct testColl allGatherTest = {
6058

6159
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
6260
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
63-
AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
61+
AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
6462
}
6563

6664
testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

‎src/all_reduce.cu‎

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
#include "cuda_runtime.h"
88
#include "common.h"
99

10-
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
10+
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1111
*sendcount = count;
1212
*recvcount = count;
1313
*sendInplaceOffset = 0;
@@ -55,7 +55,7 @@ struct testColl allReduceTest = {
5555

5656
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
5757
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
58-
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
58+
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
5959
}
6060

6161
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

‎src/alltoall.cu‎

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,12 @@
77
#include "cuda_runtime.h"
88
#include "common.h"
99

10-
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
11-
*sendcount = (count/nranks)*nranks;
12-
*recvcount = (count/nranks)*nranks;
10+
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
11+
*paramcount = (count/nranks) & -(16/eltSize);
12+
*sendcount = nranks*(*paramcount);
13+
*recvcount = *sendcount;
1314
*sendInplaceOffset = 0;
1415
*recvInplaceOffset = 0;
15-
*paramcount = count/nranks;
1616
}
1717

1818
testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -74,7 +74,7 @@ struct testColl alltoAllTest = {
7474

7575
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
7676
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
77-
AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
77+
AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
7878
}
7979

8080
testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

‎src/broadcast.cu‎

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
#include "cuda_runtime.h"
88
#include "common.h"
99

10-
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
10+
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1111
*sendcount = count;
1212
*recvcount = count;
1313
*sendInplaceOffset = 0;
@@ -64,7 +64,7 @@ struct testColl broadcastTest = {
6464

6565
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
6666
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
67-
BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
67+
BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
6868
}
6969

7070
testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

‎src/common.cu‎

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -571,7 +571,7 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
571571
size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;
572572

573573
count = size / wordSize(type);
574-
args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);
574+
args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, wordSize(type), (size_t)nranks);
575575

576576
args->nbytes = paramCount * wordSize(type);
577577
args->sendBytes = sendCount * wordSize(type);

‎src/common.h‎

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,7 @@ struct testColl {
8787
void (*getCollByteCount)(
8888
size_t *sendcount, size_t *recvcount, size_t *paramcount,
8989
size_t *sendInplaceOffset, size_t *recvInplaceOffset,
90-
size_t count, int nranks);
90+
size_t count, size_t eltSize, int nranks);
9191
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
9292
ncclRedOp_t op, int root, int rep, int in_place);
9393
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);

‎src/gather.cu‎

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,12 @@
77
#include "cuda_runtime.h"
88
#include "common.h"
99

10-
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
11-
*sendcount = count/nranks;
12-
*recvcount = (count/nranks)*nranks;
13-
*sendInplaceOffset = count/nranks;
10+
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
11+
*sendcount = (count/nranks) & -(16/eltSize);
12+
*recvcount = (*sendcount)*nranks;
13+
*sendInplaceOffset = *sendcount;
1414
*recvInplaceOffset = 0;
15-
*paramcount = count/nranks;
15+
*paramcount = *sendcount;
1616
}
1717

1818
testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -73,7 +73,7 @@ struct testColl gatherTest = {
7373

7474
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
7575
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
76-
GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
76+
GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
7777
}
7878

7979
testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

‎src/hypercube.cu‎

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@
99

1010
#define ALIGN 4
1111

12-
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
13-
size_t base = (count/(ALIGN*nranks))*ALIGN;
12+
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
13+
size_t base = (count/nranks) & -(16/eltSize);
1414
*sendcount = base;
1515
*recvcount = base*nranks;
1616
*sendInplaceOffset = base;
@@ -78,7 +78,7 @@ struct testColl hyperCubeTest = {
7878

7979
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
8080
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
81-
HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
81+
HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
8282
}
8383

8484
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

‎src/reduce.cu‎

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
#include "cuda_runtime.h"
88
#include "common.h"
99

10-
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
10+
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
1111
*sendcount = count;
1212
*recvcount = count;
1313
*sendInplaceOffset = 0;
@@ -54,7 +54,7 @@ struct testColl reduceTest = {
5454

5555
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
5656
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
57-
ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
57+
ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
5858
}
5959

6060
testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

‎src/reduce_scatter.cu‎

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,8 @@
77
#include "cuda_runtime.h"
88
#include "common.h"
99

10-
#define ALIGN 4
11-
12-
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
13-
size_t base = (count/(ALIGN*nranks))*ALIGN;
10+
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
11+
size_t base = (count/nranks) & -(16/eltSize);
1412
*sendcount = base*nranks;
1513
*recvcount = base;
1614
*sendInplaceOffset = 0;
@@ -59,7 +57,7 @@ struct testColl reduceScatterTest = {
5957

6058
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
6159
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
62-
ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
60+
ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
6361
}
6462

6563
testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {

0 commit comments

Comments
 (0)