Skip to content

Commit 96138d5

Browse files
committed
[BERT/TF] Updating for Ampere
1 parent 24b8c9c commit 96138d5

51 files changed

Lines changed: 1797 additions & 877 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

‎PyTorch/LanguageModeling/BERT/Dockerfile‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
15-
FROM nvcr.io/nvidia/tritonserver:20.03-py3-clientsdk as trt
14+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
15+
FROM nvcr.io/nvidia/tritonserver:20.06-py3-clientsdk as trt
1616
FROM ${FROM_IMAGE_NAME}
1717
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
1818

‎PyTorch/LanguageModeling/BERT/README.md‎

Lines changed: 200 additions & 129 deletions
Large diffs are not rendered by default.
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
#! /bin/bash
set -euo pipefail

# Print usage/help text to stdout.
print_usage() {
    cat << EOF
${0} [options] [--] COMMAND [ARG...]

Control binding policy for each task. Assumes one rank will be launched for each GPU.

Options:
    --cpu=MODE
      * exclusive -- bind each rank to an exclusive set of cores near its GPU
      * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
      * node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
      * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
      * off -- don't bind
    --mem=MODE
      * node -- bind each rank to the nearest NUMA node [default]
      * *.sh -- bind each rank using the bash associative array bind_mem from a file
      * off -- don't bind
    --ib=MODE
      * single -- bind each rank to a single IB device near its GPU
      * off -- do not bind [default]
    --cluster=CLUSTER
      Select which cluster is being used. May be required if system params cannot be detected.
EOF
}

################################################################################
# Argument parsing
################################################################################

cpu_mode='node'
mem_mode='node'
ib_mode='off'
cluster=''
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help) print_usage ; exit 0 ;;
        --cpu=*) cpu_mode="${1/*=/}"; shift ;;
        --cpu) cpu_mode="$2"; shift 2 ;;
        --mem=*) mem_mode="${1/*=/}"; shift ;;
        --mem) mem_mode="$2"; shift 2 ;;
        --ib=*) ib_mode="${1/*=/}"; shift ;;
        --ib) ib_mode="$2"; shift 2 ;;
        --cluster=*) cluster="${1/*=/}"; shift ;;
        --cluster) cluster="$2"; shift 2 ;;
        --) shift; break ;;
        *) break ;;
    esac
done
if [ $# -lt 1 ]; then
    # FIX: was `2>&1`, which is a no-op here; errors belong on stderr.
    echo 'ERROR: no command given' >&2
    print_usage
    exit 1
fi

################################################################################
# Get system params
################################################################################

# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
    echo 'ERROR: cannot read LOCAL_RANK from env' >&2
    exit 1
fi

num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
if [ "${local_rank}" -ge "${num_gpus}" ]; then
    echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
    exit 1
fi

# Filter `lscpu` output for the field named "$1" and print its value with
# spaces stripped; exits non-zero if the field is not found.
get_lscpu_value() {
    awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
}
lscpu_out=$(lscpu)
num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")

echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"

readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
if [ ${num_gpus} -gt 1 ]; then
    readonly gpus_per_node=$(( num_gpus / num_nodes ))
else
    readonly gpus_per_node=1
fi
readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
readonly local_node=$(( local_rank / gpus_per_node ))


declare -a ibdevs=()
case "${cluster}" in
    circe)
        # Need to specialize for circe because IB detection is hard
        ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
        ;;
    selene)
        # Need to specialize for selene because IB detection is hard
        ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
        ;;
    '')
        if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
            mapfile -t ibdevs <<< "${ibstat_out}"
        fi
        ;;
    *)
        echo "ERROR: Unknown cluster '${cluster}'" >&2
        exit 1
        ;;
esac
readonly num_ibdevs="${#ibdevs[@]}"

################################################################################
# Setup for exec
################################################################################

declare -a numactl_args=()

case "${cpu_mode}" in
    exclusive)
        # Two core ranges per rank: physical cores plus their SMT siblings.
        numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
            $(( local_rank * cores_per_gpu )) \
            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
            $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
            $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
        )" )
        ;;
    exclusive,nosmt)
        numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
            $(( local_rank * cores_per_gpu )) \
            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
        )" )
        ;;
    node)
        numactl_args+=( "--cpunodebind=${local_node}" )
        ;;
    *.sh)
        source "${cpu_mode}"
        if [ -n "${bind_cpu_cores:-}" ]; then
            numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
        elif [ -n "${bind_cpu_nodes:-}" ]; then
            numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
        else
            echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
            exit 1
        fi
        ;;
    off|'')
        ;;
    *)
        # FIX: was `2>&1`; errors belong on stderr.
        echo "ERROR: invalid cpu mode '${cpu_mode}'" >&2
        print_usage
        exit 1
        ;;
esac

case "${mem_mode}" in
    node)
        numactl_args+=( "--membind=${local_node}" )
        ;;
    *.sh)
        source "${mem_mode}"
        if [ -z "${bind_mem:-}" ]; then
            echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
            exit 1
        fi
        numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
        ;;
    off|'')
        ;;
    *)
        # FIX: was `2>&1`; errors belong on stderr.
        echo "ERROR: invalid mem mode '${mem_mode}'" >&2
        print_usage
        exit 1
        ;;
esac

case "${ib_mode}" in
    single)
        if [ "${num_ibdevs}" -eq 0 ]; then
            # FIX: was `2>&1`; warnings belong on stderr.
            echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." >&2
        else
            readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
            export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
        fi
        ;;
    off|'')
        ;;
    *)
        # FIX: was `2>&1`; errors belong on stderr.
        echo "ERROR: invalid ib mode '${ib_mode}'" >&2
        print_usage
        exit 1
        ;;
esac

################################################################################
# Exec
################################################################################

if [ "${#numactl_args[@]}" -gt 0 ] ; then
    set -x
    exec numactl "${numactl_args[@]}" -- "${@}"
else
    exec "${@}"
fi

‎PyTorch/LanguageModeling/BERT/modeling.py‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,17 +119,24 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
119119
def gelu(x):
    """Gaussian Error Linear Unit, erf formulation: x * Phi(x).

    The erf form is used (instead of torch.nn.functional.gelu) because it
    is safe for ONNX export.
    """
    # FIX: was the truncated constant 1.41421; use full-precision sqrt(2)
    # so the result matches the mathematical definition (and F.gelu) to
    # machine precision.
    return x * 0.5 * (1.0 + torch.erf(x / 1.4142135623730951))

# used only for triton inference
def bias_gelu(bias, y):
    """Fused bias-add followed by erf-formulation GELU (ONNX-exportable)."""
    x = bias + y
    return x * 0.5 * (1.0 + torch.erf(x / 1.4142135623730951))

# used specifically for training since torch.nn.functional.gelu breaks ONNX export
def bias_gelu_training(bias, y):
    """Fused bias-add followed by F.gelu; faster, but breaks ONNX export."""
    x = bias + y
    return torch.nn.functional.gelu(x) # Breaks ONNX export

def bias_tanh(bias, y):
    """Fused bias-add followed by tanh."""
    x = bias + y
    return torch.tanh(x)

def swish(x):
    """Swish / SiLU activation: x * sigmoid(x)."""
    return x * torch.sigmoid(x)

# Name -> activation mapping used by the model config. "bias_gelu" is
# swapped to bias_gelu_training by the training scripts (see run_pretraining).
ACT2FN = {"gelu": gelu, "bias_gelu": bias_gelu, "bias_tanh": bias_tanh, "relu": torch.nn.functional.relu, "swish": swish}
134141

135142
class LinearActivation(Module):

‎PyTorch/LanguageModeling/BERT/requirements.txt‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,7 @@ ipdb
1010
h5py
1111
html2text
1212
nltk
13-
progressbar
13+
progressbar
14+
#Others
15+
onnxruntime
16+
git+https://github.com/NVIDIA/dllogger

‎PyTorch/LanguageModeling/BERT/run.sub‎

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
set -eux
2020

2121
# The following variables need to be set
22-
# Base container to be used
23-
readonly docker_image="nvcr.io/nvidia/pytorch:19.10-py3"
22+
# Base container to be used - container built in step 1 on quick start guide
23+
readonly docker_image="nvcr.io/nvidia/pytorch:20.06-py3"
2424
# Location of dataset for phase 1
2525
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
2626
# Location of dataset for phase 2
@@ -30,6 +30,8 @@ readonly checkpointdir="$PWD/checkpoints"
3030

3131
readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
3232

33+
BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
34+
3335
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
3436

3537
PHASE1="\
@@ -59,7 +61,7 @@ PHASES=( "$PHASE1" "$PHASE2" )
5961
PHASE=${PHASE:-1}
6062

6163
BERT_CMD="\
62-
python -u /workspace/bert/run_pretraining.py \
64+
${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
6365
--seed=42 \
6466
${PHASES[$((PHASE-1))]} \
6567
--do_train \

‎PyTorch/LanguageModeling/BERT/run_glue.py‎

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from tqdm import tqdm, trange
3434

3535
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
36-
from modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
36+
import modeling
3737
from tokenization import BertTokenizer
3838
from optimization import BertAdam, warmup_linear
3939
from schedulers import LinearWarmUpScheduler
@@ -552,12 +552,13 @@ def main():
552552
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
553553

554554
# Prepare model
555-
config = BertConfig.from_json_file(args.config_file)
555+
config = modeling.BertConfig.from_json_file(args.config_file)
556556
# Padding for divisibility by 8
557557
if config.vocab_size % 8 != 0:
558558
config.vocab_size += 8 - (config.vocab_size % 8)
559559

560-
model = BertForSequenceClassification(config, num_labels=num_labels)
560+
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
561+
model = modeling.BertForSequenceClassification(config, num_labels=num_labels)
561562
print("USING CHECKPOINT from", args.init_checkpoint)
562563
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
563564
print("USED CHECKPOINT from", args.init_checkpoint)

‎PyTorch/LanguageModeling/BERT/run_pretraining.py‎

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def parse_arguments():
198198
"E.g., 0.1 = 10%% of training.")
199199
parser.add_argument("--local_rank",
200200
type=int,
201-
default=-1,
201+
default=os.getenv('LOCAL_RANK', -1),
202202
help="local_rank for distributed training on gpus")
203203
parser.add_argument('--seed',
204204
type=int,
@@ -272,7 +272,13 @@ def parse_arguments():
272272
default=False,
273273
action='store_true',
274274
help='Disable tqdm progress bar')
275+
parser.add_argument('--steps_this_run', type=int, default=-1,
276+
help='If provided, only run this many steps before exiting')
277+
275278
args = parser.parse_args()
279+
280+
if args.steps_this_run < 0:
281+
args.steps_this_run = args.max_steps
276282

277283
return args
278284

@@ -291,7 +297,7 @@ def setup_training(args):
291297
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
292298
torch.distributed.init_process_group(backend='nccl', init_method='env://')
293299
args.n_gpu = 1
294-
300+
295301
if args.gradient_accumulation_steps == 1:
296302
args.allreduce_post_accumulation = False
297303
args.allreduce_post_accumulation_fp16 = False
@@ -336,7 +342,7 @@ def prepare_model_and_optimizer(args, device):
336342
if config.vocab_size % 8 != 0:
337343
config.vocab_size += 8 - (config.vocab_size % 8)
338344

339-
modeling.ACT2FN["bias_gelu"] = torch.jit.script(modeling.ACT2FN["bias_gelu"])
345+
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
340346
model = modeling.BertForPreTraining(config)
341347

342348
checkpoint = None
@@ -481,9 +487,6 @@ def main():
481487
global timeout_sent
482488

483489
args = parse_arguments()
484-
485-
if args.use_env and 'LOCAL_RANK' in os.environ:
486-
args.local_rank = int(os.environ['LOCAL_RANK'])
487490

488491
random.seed(args.seed + args.local_rank)
489492
np.random.seed(args.seed + args.local_rank)
@@ -604,7 +607,7 @@ def main():
604607
lr_scheduler.step() # learning rate warmup
605608
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
606609

607-
if global_step >= args.max_steps:
610+
if global_step >= args.steps_this_run or timeout_sent:
608611
train_time_raw = time.time() - raw_train_start
609612
last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
610613
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
@@ -623,7 +626,8 @@ def main():
623626
"learning_rate": optimizer.param_groups[0]['lr']})
624627
average_loss = 0
625628

626-
if global_step >= args.max_steps or training_steps % (
629+
630+
if global_step >= args.steps_this_run or training_steps % (
627631
args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent:
628632
if is_main_process() and not args.skip_checkpoint:
629633
# Save a trained model
@@ -649,7 +653,7 @@ def main():
649653

650654
# Exiting the training due to hitting max steps, or being sent a
651655
# timeout from the cluster scheduler
652-
if global_step >= args.max_steps or timeout_sent:
656+
if global_step >= args.steps_this_run or timeout_sent:
653657
del train_dataloader
654658
# thread.join()
655659
return args, final_loss, train_time_raw, global_step

0 commit comments

Comments
 (0)