forked from IBM/dolomite-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpretrain.sh
21 lines (19 loc) · 857 Bytes
/
pretrain.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/usr/bin/env bash
#
# Launches dolomite_engine.pretrain through torchrun, deriving the multi-node
# topology from the LSB_* variables of the surrounding batch job.
#
# Usage: pretrain.sh <config>
#
# Environment read:
#   LSB_MCPU_HOSTS        "host1 slots1 host2 slots2 ..." list; the first host
#                         becomes the rendezvous endpoint.
#   LSB_JOBID             job id; four of its digits form the rendezvous port.
#   CUDA_VISIBLE_DEVICES  comma-separated device list; one process per entry.
#   HOSTNAME              this node's name, used to look up its rank.

# Optional debugging switches, disabled by default.
# export PYTHONFAULTHANDLER=1
# export NCCL_DEBUG="INFO"
# export NCCL_DEBUG_FILE="$LOG_PATH/NCCL_DEBUG.%h.%p.txt"
# export NCCL_TOPO_DUMP_FILE="$LOG_PATH/NCCL_TOP.%h.xml"
export NCCL_SOCKET_IFNAME="ib,bond"
export NCCL_IB_CUDA_SUPPORT=1

#######################################
# Prints the host names of a "host1 slots1 host2 slots2 ..." string,
# one per line, in their original order and spelling.
# Arguments: $1 - host/slot list
# Outputs:   host names on stdout
#######################################
list_hosts() {
  local -a fields=()
  local i
  # read returns non-zero at end of input with -d ''; the array is still filled.
  read -r -d '' -a fields <<< "$1" || true
  for ((i = 0; i < ${#fields[@]}; i += 2)); do
    printf '%s\n' "${fields[i]}"
  done
}

#######################################
# Prints the zero-based position of a host inside a host/slot list.
# Names are compared exactly after stripping any domain part, so "node1"
# never matches "node10".
# Arguments: $1 - host name to look up (short or fully qualified)
#            $2 - host/slot list
# Outputs:   rank on stdout
# Returns:   0 if found, 1 otherwise
#######################################
find_node_rank() {
  local wanted=${1%%.*}
  local rank=0 host
  while IFS= read -r host; do
    if [[ "${host%%.*}" == "$wanted" ]]; then
      printf '%d\n' "$rank"
      return 0
    fi
    rank=$((rank + 1))
  done < <(list_hosts "$2")
  return 1
}

#######################################
# Derives the rendezvous port from a job id: "5" followed by the four
# characters that precede the last character of the id. Ids shorter than
# five characters are left-padded with zeros so the port keeps five digits.
# Arguments: $1 - job id
# Outputs:   port on stdout
#######################################
derive_master_port() {
  local padded="00000$1"
  printf '5%s\n' "${padded: -5:4}"
}

#######################################
# Counts the non-empty entries of a comma-separated device list.
# Arguments: $1 - device list, e.g. "0,1,2,3"
# Outputs:   count on stdout
#######################################
count_visible_gpus() {
  local -a ids=()
  local id count=0
  IFS=', ' read -r -a ids <<< "$1"
  for id in "${ids[@]}"; do
    if [[ -n "$id" ]]; then
      count=$((count + 1))
    fi
  done
  printf '%d\n' "$count"
}

#######################################
# Validates the inputs, computes the topology and runs torchrun.
# Arguments: $1 - path of the config handed to dolomite_engine.pretrain
# Returns:   2 on usage error, 1 on invalid environment,
#            otherwise the exit status of torchrun
#######################################
main() {
  if [[ -z "${1:-}" ]]; then
    printf 'Usage: %s <config>\n' "${0##*/}" >&2
    return 2
  fi
  local config=$1

  if [[ -z "${LSB_MCPU_HOSTS:-}" ]]; then
    printf 'error: LSB_MCPU_HOSTS is empty or unset\n' >&2
    return 1
  fi

  local -a hosts=()
  mapfile -t hosts < <(list_hosts "$LSB_MCPU_HOSTS")
  if ((${#hosts[@]} == 0)); then
    printf 'error: no hosts found in LSB_MCPU_HOSTS\n' >&2
    return 1
  fi
  local master_address=${hosts[0]}
  local nnodes=${#hosts[@]}

  local master_port
  master_port=$(derive_master_port "${LSB_JOBID:-}")

  local gpus_per_node
  gpus_per_node=$(count_visible_gpus "${CUDA_VISIBLE_DEVICES:-}")
  if ((gpus_per_node == 0)); then
    printf 'error: CUDA_VISIBLE_DEVICES lists no devices\n' >&2
    return 1
  fi

  local this_host=${HOSTNAME:-$(hostname)}
  local node_rank
  # Fail loudly: a wrong or negative rank would otherwise stall the rendezvous.
  if ! node_rank=$(find_node_rank "$this_host" "$LSB_MCPU_HOSTS"); then
    printf 'error: host %s is not listed in LSB_MCPU_HOSTS (%s)\n' \
      "$this_host" "$LSB_MCPU_HOSTS" >&2
    return 1
  fi

  TOKENIZERS_PARALLELISM=false torchrun \
    --nnodes="$nnodes" \
    --node_rank="$node_rank" \
    --nproc_per_node="$gpus_per_node" \
    --rdzv_id=101 \
    --rdzv_endpoint="$master_address:$master_port" \
    -m dolomite_engine.pretrain \
    --config "$config"
}

main "$@"