|
7 | 7 | #include "cuda_runtime.h" |
8 | 8 | #include "common.h" |
9 | 9 |
|
10 | | -void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { |
11 | | - *sendcount = count/nranks; |
12 | | - *recvcount = (count/nranks)*nranks; |
13 | | - *sendInplaceOffset = count/nranks; |
| 10 | +void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { |
| 11 | + *sendcount = (count/nranks) & -(16/eltSize); |
| 12 | + *recvcount = (*sendcount)*nranks; |
| 13 | + *sendInplaceOffset = *sendcount; |
14 | 14 | *recvInplaceOffset = 0; |
15 | | - *paramcount = count/nranks; |
| 15 | + *paramcount = *sendcount; |
16 | 16 | } |
17 | 17 |
|
18 | 18 | testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { |
@@ -73,7 +73,7 @@ struct testColl gatherTest = { |
73 | 73 |
|
74 | 74 | void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { |
75 | 75 | size_t paramcount, sendInplaceOffset, recvInplaceOffset; |
76 | | - GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); |
| 76 | + GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks); |
77 | 77 | } |
78 | 78 |
|
79 | 79 | testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { |
|
0 commit comments