Skip to content

Commit 96138d5

Browse files
committed
[BERT/TF] Updating for Ampere
1 parent 24b8c9c commit 96138d5

51 files changed

Lines changed: 1797 additions & 877 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

‎PyTorch/LanguageModeling/BERT/Dockerfile‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
15-
FROM nvcr.io/nvidia/tritonserver:20.03-py3-clientsdk as trt
14+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
15+
FROM nvcr.io/nvidia/tritonserver:20.06-py3-clientsdk as trt
1616
FROM ${FROM_IMAGE_NAME}
1717
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
1818

‎PyTorch/LanguageModeling/BERT/README.md‎

Lines changed: 200 additions & 129 deletions
Large diffs are not rendered by default.
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
#! /bin/bash
set -euo pipefail

# Print usage/help text to stdout.
print_usage() {
    cat << EOF
${0} [options] [--] COMMAND [ARG...]

Control binding policy for each task. Assumes one rank will be launched for each GPU.

Options:
    --cpu=MODE
      * exclusive -- bind each rank to an exclusive set of cores near its GPU
      * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
      * node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
      * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
      * off -- don't bind
    --mem=MODE
      * node -- bind each rank to the nearest NUMA node [default]
      * *.sh -- bind each rank using the bash associative array bind_mem from a file
      * off -- don't bind
    --ib=MODE
      * single -- bind each rank to a single IB device near its GPU
      * off -- do not bind [default]
    --cluster=CLUSTER
      Select which cluster is being used. May be required if system params cannot be detected.
EOF
}

################################################################################
# Argument parsing
################################################################################

cpu_mode='node'
mem_mode='node'
ib_mode='off'
cluster=''
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help) print_usage ; exit 0 ;;
        --cpu=*) cpu_mode="${1/*=/}"; shift ;;
        --cpu) cpu_mode="$2"; shift 2 ;;
        --mem=*) mem_mode="${1/*=/}"; shift ;;
        --mem) mem_mode="$2"; shift 2 ;;
        --ib=*) ib_mode="${1/*=/}"; shift ;;
        --ib) ib_mode="$2"; shift 2 ;;
        --cluster=*) cluster="${1/*=/}"; shift ;;
        --cluster) cluster="$2"; shift 2 ;;
        --) shift; break ;;
        *) break ;;
    esac
done
if [ $# -lt 1 ]; then
    # FIX: was `2>&1`, which is a no-op here; errors belong on stderr.
    echo 'ERROR: no command given' >&2
    print_usage
    exit 1
fi

################################################################################
# Get system params
################################################################################

# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
    echo 'ERROR: cannot read LOCAL_RANK from env' >&2
    exit 1
fi

num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
if [ "${local_rank}" -ge "${num_gpus}" ]; then
    echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
    exit 1
fi

# Filter `lscpu` output for the field named "$1" and print its value with
# spaces stripped; exits non-zero if the field is not found.
get_lscpu_value() {
    awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
}
lscpu_out=$(lscpu)
num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")

echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"

readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
if [ ${num_gpus} -gt 1 ]; then
    readonly gpus_per_node=$(( num_gpus / num_nodes ))
else
    readonly gpus_per_node=1
fi
readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
readonly local_node=$(( local_rank / gpus_per_node ))


declare -a ibdevs=()
case "${cluster}" in
    circe)
        # Need to specialize for circe because IB detection is hard
        ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
        ;;
    selene)
        # Need to specialize for selene because IB detection is hard
        ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
        ;;
    '')
        if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
            mapfile -t ibdevs <<< "${ibstat_out}"
        fi
        ;;
    *)
        echo "ERROR: Unknown cluster '${cluster}'" >&2
        exit 1
        ;;
esac
readonly num_ibdevs="${#ibdevs[@]}"

################################################################################
# Setup for exec
################################################################################

declare -a numactl_args=()

case "${cpu_mode}" in
    exclusive)
        # Two core ranges per rank: physical cores plus their SMT siblings.
        numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
            $(( local_rank * cores_per_gpu )) \
            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
            $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
            $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
        )" )
        ;;
    exclusive,nosmt)
        numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
            $(( local_rank * cores_per_gpu )) \
            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
        )" )
        ;;
    node)
        numactl_args+=( "--cpunodebind=${local_node}" )
        ;;
    *.sh)
        source "${cpu_mode}"
        if [ -n "${bind_cpu_cores:-}" ]; then
            numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
        elif [ -n "${bind_cpu_nodes:-}" ]; then
            numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
        else
            echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
            exit 1
        fi
        ;;
    off|'')
        ;;
    *)
        # FIX: was `2>&1`; errors belong on stderr.
        echo "ERROR: invalid cpu mode '${cpu_mode}'" >&2
        print_usage
        exit 1
        ;;
esac

case "${mem_mode}" in
    node)
        numactl_args+=( "--membind=${local_node}" )
        ;;
    *.sh)
        source "${mem_mode}"
        if [ -z "${bind_mem:-}" ]; then
            echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
            exit 1
        fi
        numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
        ;;
    off|'')
        ;;
    *)
        # FIX: was `2>&1`; errors belong on stderr.
        echo "ERROR: invalid mem mode '${mem_mode}'" >&2
        print_usage
        exit 1
        ;;
esac

case "${ib_mode}" in
    single)
        if [ "${num_ibdevs}" -eq 0 ]; then
            # FIX: was `2>&1`; warnings belong on stderr.
            echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." >&2
        else
            readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
            export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
        fi
        ;;
    off|'')
        ;;
    *)
        # FIX: was `2>&1`; errors belong on stderr.
        echo "ERROR: invalid ib mode '${ib_mode}'" >&2
        print_usage
        exit 1
        ;;
esac

################################################################################
# Exec
################################################################################

if [ "${#numactl_args[@]}" -gt 0 ] ; then
    set -x
    exec numactl "${numactl_args[@]}" -- "${@}"
else
    exec "${@}"
fi

‎PyTorch/LanguageModeling/BERT/modeling.py‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,17 +119,24 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
119119
def gelu(x):
    """Gaussian Error Linear Unit, erf formulation: x * Phi(x).

    The erf form is used (instead of torch.nn.functional.gelu) because it
    is safe for ONNX export.
    """
    # FIX: was the truncated constant 1.41421; use full-precision sqrt(2)
    # so the result matches the mathematical definition (and F.gelu) to
    # machine precision.
    return x * 0.5 * (1.0 + torch.erf(x / 1.4142135623730951))

# used only for triton inference
def bias_gelu(bias, y):
    """Fused bias-add followed by erf-formulation GELU (ONNX-exportable)."""
    x = bias + y
    return x * 0.5 * (1.0 + torch.erf(x / 1.4142135623730951))

# used specifically for training since torch.nn.functional.gelu breaks ONNX export
def bias_gelu_training(bias, y):
    """Fused bias-add followed by F.gelu; faster, but breaks ONNX export."""
    x = bias + y
    return torch.nn.functional.gelu(x) # Breaks ONNX export

def bias_tanh(bias, y):
    """Fused bias-add followed by tanh."""
    x = bias + y
    return torch.tanh(x)

def swish(x):
    """Swish / SiLU activation: x * sigmoid(x)."""
    return x * torch.sigmoid(x)

# Name -> activation mapping used by the model config. "bias_gelu" is
# swapped to bias_gelu_training by the training scripts (see run_pretraining).
ACT2FN = {"gelu": gelu, "bias_gelu": bias_gelu, "bias_tanh": bias_tanh, "relu": torch.nn.functional.relu, "swish": swish}
134141

135142
class LinearActivation(Module):

‎PyTorch/LanguageModeling/BERT/requirements.txt‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,7 @@ ipdb
1010
h5py
1111
html2text
1212
nltk
13-
progressbar
13+
progressbar
14+
#Others
15+
onnxruntime
16+
git+https://github.com/NVIDIA/dllogger

‎PyTorch/LanguageModeling/BERT/run.sub‎

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
set -eux
2020

2121
# The following variables need to be set
22-
# Base container to be used
23-
readonly docker_image="nvcr.io/nvidia/pytorch:19.10-py3"
22+
# Base container to be used - container built in step 1 on quick start guide
23+
readonly docker_image="nvcr.io/nvidia/pytorch:20.06-py3"
2424
# Location of dataset for phase 1
2525
readonly datadir="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
2626
# Location of dataset for phase 2
@@ -30,6 +30,8 @@ readonly checkpointdir="$PWD/checkpoints"
3030

3131
readonly mounts=".:/workspace/bert,${datadir}:/workspace/data,${datadir_phase2}:/workspace/data_phase2,${checkpointdir}:/results"
3232

33+
BIND_CMD="./bind.sh --cpu=exclusive --ib=single --"
34+
3335
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}"
3436

3537
PHASE1="\
@@ -59,7 +61,7 @@ PHASES=( "$PHASE1" "$PHASE2" )
5961
PHASE=${PHASE:-1}
6062

6163
BERT_CMD="\
62-
python -u /workspace/bert/run_pretraining.py \
64+
${BIND_CMD} python -u /workspace/bert/run_pretraining.py \
6365
--seed=42 \
6466
${PHASES[$((PHASE-1))]} \
6567
--do_train \

‎PyTorch/LanguageModeling/BERT/run_glue.py‎

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from tqdm import tqdm, trange
3434

3535
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
36-
from modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
36+
import modeling
3737
from tokenization import BertTokenizer
3838
from optimization import BertAdam, warmup_linear
3939
from schedulers import LinearWarmUpScheduler
@@ -552,12 +552,13 @@ def main():
552552
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
553553

554554
# Prepare model
555-
config = BertConfig.from_json_file(args.config_file)
555+
config = modeling.BertConfig.from_json_file(args.config_file)
556556
# Padding for divisibility by 8
557557
if config.vocab_size % 8 != 0:
558558
config.vocab_size += 8 - (config.vocab_size % 8)
559559

560-
model = BertForSequenceClassification(config, num_labels=num_labels)
560+
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
561+
model = modeling.BertForSequenceClassification(config, num_labels=num_labels)
561562
print("USING CHECKPOINT from", args.init_checkpoint)
562563
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
563564
print("USED CHECKPOINT from", args.init_checkpoint)

‎PyTorch/LanguageModeling/BERT/run_pretraining.py‎

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def parse_arguments():
198198
"E.g., 0.1 = 10%% of training.")
199199
parser.add_argument("--local_rank",
200200
type=int,
201-
default=-1,
201+
default=os.getenv('LOCAL_RANK', -1),
202202
help="local_rank for distributed training on gpus")
203203
parser.add_argument('--seed',
204204
type=int,
@@ -272,7 +272,13 @@ def parse_arguments():
272272
default=False,
273273
action='store_true',
274274
help='Disable tqdm progress bar')
275+
parser.add_argument('--steps_this_run', type=int, default=-1,
276+
help='If provided, only run this many steps before exiting')
277+
275278
args = parser.parse_args()
279+
280+
if args.steps_this_run < 0:
281+
args.steps_this_run = args.max_steps
276282

277283
return args
278284

@@ -291,7 +297,7 @@ def setup_training(args):
291297
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
292298
torch.distributed.init_process_group(backend='nccl', init_method='env://')
293299
args.n_gpu = 1
294-
300+
295301
if args.gradient_accumulation_steps == 1:
296302
args.allreduce_post_accumulation = False
297303
args.allreduce_post_accumulation_fp16 = False
@@ -336,7 +342,7 @@ def prepare_model_and_optimizer(args, device):
336342
if config.vocab_size % 8 != 0:
337343
config.vocab_size += 8 - (config.vocab_size % 8)
338344

339-
modeling.ACT2FN["bias_gelu"] = torch.jit.script(modeling.ACT2FN["bias_gelu"])
345+
modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
340346
model = modeling.BertForPreTraining(config)
341347

342348
checkpoint = None
@@ -481,9 +487,6 @@ def main():
481487
global timeout_sent
482488

483489
args = parse_arguments()
484-
485-
if args.use_env and 'LOCAL_RANK' in os.environ:
486-
args.local_rank = int(os.environ['LOCAL_RANK'])
487490

488491
random.seed(args.seed + args.local_rank)
489492
np.random.seed(args.seed + args.local_rank)
@@ -604,7 +607,7 @@ def main():
604607
lr_scheduler.step() # learning rate warmup
605608
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
606609

607-
if global_step >= args.max_steps:
610+
if global_step >= args.steps_this_run or timeout_sent:
608611
train_time_raw = time.time() - raw_train_start
609612
last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
610613
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
@@ -623,7 +626,8 @@ def main():
623626
"learning_rate": optimizer.param_groups[0]['lr']})
624627
average_loss = 0
625628

626-
if global_step >= args.max_steps or training_steps % (
629+
630+
if global_step >= args.steps_this_run or training_steps % (
627631
args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent:
628632
if is_main_process() and not args.skip_checkpoint:
629633
# Save a trained model
@@ -649,7 +653,7 @@ def main():
649653

650654
# Exiting the training due to hitting max steps, or being sent a
651655
# timeout from the cluster scheduler
652-
if global_step >= args.max_steps or timeout_sent:
656+
if global_step >= args.steps_this_run or timeout_sent:
653657
del train_dataloader
654658
# thread.join()
655659
return args, final_loss, train_time_raw, global_step

0 commit comments

Comments
 (0)