From a691b7479a1c4695b957c38a6da7d97c48195a05 Mon Sep 17 00:00:00 2001
From: Emicia <veronika.scholz@tu-dresden.de>
Date: Wed, 29 Sep 2021 10:41:03 +0200
Subject: [PATCH] corrections on the tensorflow script

---
 .../docs/software/distributed_training.md | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/doc.zih.tu-dresden.de/docs/software/distributed_training.md b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
index 9bc1bb24c..6ea30f7bd 100644
--- a/doc.zih.tu-dresden.de/docs/software/distributed_training.md
+++ b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
@@ -99,19 +99,22 @@ in this case on two nodes.
 TensorFlow is available as a module.
 Check for the version.
 The tf_config environment variable can be set as a prefix to the command.
-Now, run the script simultaneously on both nodes:
+Now, run the script on the sub-cluster "Alpha Centauri" simultaneously on both nodes:
 
 ```bash
 #!/bin/bash
+
 #SBATCH -J distr
-#SBATCH -p ml
-#SBATCH --mem=250G
-#SBATCH --nodes=2
-#SBATCH --ntasks=2
+#SBATCH -p alpha
+#SBATCH --output=%j.out
+#SBATCH --error=%j.err
+#SBATCH --mem=64000
+#SBATCH -N 2
+#SBATCH -n 2
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=14
 #SBATCH --gres=gpu:1
-#SBATCH --time=0:20:00
+#SBATCH --time=01:00:00
 
 function print_nodelist {
         scontrol show hostname $SLURM_NODELIST
@@ -121,14 +124,16 @@ NODE_2=$(print_nodelist | awk '{print $1}' | sort -u | tail -n 1)
 IP_1=$(dig +short ${NODE_1}.taurus.hrsk.tu-dresden.de)
 IP_2=$(dig +short ${NODE_2}.taurus.hrsk.tu-dresden.de)
 
-module load modenv/ml
-module load TensorFlow
+module load modenv/hiera
+module load modenv/hiera GCC/10.2.0 CUDA/11.1.1 OpenMPI/4.0.5 TensorFlow/2.4.1
 
 # On the first node
-srun -w ${NODE_1} --nodes=1 --ntasks=1 --gres=gpu:1 TF_CONFIG='{"cluster": {"worker": ["${NODE_1}:33562", "${NODE_2}:33561"]}, "task": {"index": 0, "type": "worker"}}' python worker.py &
+TF_CONFIG='{"cluster": {"worker": ["'"${NODE_1}"':33562", "'"${NODE_2}"':33561"]}, "task": {"index": 0, "type": "worker"}}' srun -w ${NODE_1} -N 1 --ntasks=1 --gres=gpu:1 python main_ddl.py &
 
 # On the second node
-srun -w ${NODE_2} --nodes=1 --ntasks=1 --gres=gpu:1 TF_CONFIG='{"cluster": {"worker": ["${NODE_1}:33562", "${NODE_2}:33561"]}, "task": {"index": 1, "type": "worker"}}' python worker.py
+TF_CONFIG='{"cluster": {"worker": ["'"${NODE_1}"':33562", "'"${NODE_2}"':33561"]}, "task": {"index": 1, "type": "worker"}}' srun -w ${NODE_2} -N 1 --ntasks=1 --gres=gpu:1 python main_ddl.py &
+
+wait
 ```
 
 ### Distributed PyTorch
--
GitLab
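
The worker script `main_ddl.py` launched by the batch file is not included in this diff. As a rough orientation, a minimal version of such a script could look like the following sketch, assuming `tf.distribute.MultiWorkerMirroredStrategy` (stable as of the TensorFlow/2.4.1 module loaded above) as the consumer of the TF_CONFIG variable set in the batch script; the model and data here are placeholders, not part of the patch:

```python
# Hypothetical sketch of main_ddl.py -- the actual script is not part of
# this patch. Assumes tf.distribute.MultiWorkerMirroredStrategy, which
# reads the TF_CONFIG variable exported by the batch script above.
import numpy as np
import tensorflow as tf

# The strategy picks up cluster membership and this worker's task index
# from TF_CONFIG; both nodes run identical code.
strategy = tf.distribute.MultiWorkerMirroredStrategy()

# Placeholder data standing in for a real input pipeline.
x = np.random.random((1024, 28, 28)).astype("float32")
y = np.random.randint(0, 10, size=(1024,)).astype("int64")
dataset = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(1024).batch(64)

# Variables must be created inside the strategy scope so they are
# replicated and kept in sync across the two workers.
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(10),
    ])
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

# Gradients are all-reduced across workers on every training step.
model.fit(dataset, epochs=3)
```

Both `srun`-launched processes run this same file; each derives its role (worker 0 or worker 1) from the "task" entry of its own TF_CONFIG, which is why the batch script needs no per-node code differences, only two different TF_CONFIG prefixes.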