From ef94e413855bc9deff2660dcf11d36b67e5e4123 Mon Sep 17 00:00:00 2001
From: Natalie Breidenbach <natalie.breidenbach@tu-dresden.de>
Date: Tue, 28 Nov 2023 08:09:43 +0100
Subject: [PATCH] Update distributed_training.md

---
 .../docs/software/distributed_training.md | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/doc.zih.tu-dresden.de/docs/software/distributed_training.md b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
index 094b6f8dc..096281640 100644
--- a/doc.zih.tu-dresden.de/docs/software/distributed_training.md
+++ b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
@@ -99,13 +99,12 @@ Each worker runs the training loop independently.
     TensorFlow is available as a module.
     Check for the version.
     The `TF_CONFIG` environment variable can be set as a prefix to the command.
-    Now, run the script on the partition `alpha` simultaneously on both nodes:
+    Now, run the script on the cluster `alpha` simultaneously on both nodes:

     ```bash
     #!/bin/bash
     #SBATCH --job-name=distr
-    #SBATCH --partition=alpha
     #SBATCH --output=%j.out
     #SBATCH --error=%j.err
     #SBATCH --mem=64000
@@ -121,8 +120,8 @@ Each worker runs the training loop independently.
     }
     NODE_1=$(print_nodelist | awk '{print $1}' | sort -u | head -n 1)
     NODE_2=$(print_nodelist | awk '{print $1}' | sort -u | tail -n 1)
-    IP_1=$(dig +short ${NODE_1}.taurus.hrsk.tu-dresden.de)
-    IP_2=$(dig +short ${NODE_2}.taurus.hrsk.tu-dresden.de)
+    IP_1=$(dig +short ${NODE_1}.alpha.hpc.tu-dresden.de)
+    IP_2=$(dig +short ${NODE_2}.alpha.hpc.tu-dresden.de)

     module load modenv/hiera
     module load modenv/hiera GCC/10.2.0 CUDA/11.1.1 OpenMPI/4.0.5 TensorFlow/2.4.1
@@ -257,7 +256,7 @@ marie@compute$ module spider Horovod # Check available modules
 marie@compute$ module load Horovod/0.19.5-fosscuda-2019b-TensorFlow-2.2.0-Python-3.7.4
 ```

-Or if you want to use Horovod on the partition `alpha`, you can load it with the dependencies:
+Or if you want to use Horovod on the cluster `alpha`, you can load it with the dependencies:

 ```console
 marie@alpha$ module spider Horovod #Check available modules
@@ -324,7 +323,7 @@ Hello from: 0
     [official examples](https://github.com/horovod/horovod/tree/master/examples)
     to parallelize your code.
     In Horovod, each GPU gets pinned to a process.
-    You can easily start your job with the following bash script with four processes on two nodes:
+    You can easily start your job with the following bash script with four processes on two nodes using the cluster Power:

     ```bash
     #!/bin/bash
     #SBATCH --nodes=2
     #SBATCH --ntasks=4
     #SBATCH --ntasks-per-node=2
     #SBATCH --gres=gpu:2
-    #SBATCH --partition=ml
     #SBATCH --mem=250G
     #SBATCH --time=01:00:00
     #SBATCH --output=run_horovod.out
--
GitLab
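
For reference, the first hunk's note that "The `TF_CONFIG` environment variable can be set as a prefix to the command" can be illustrated with a minimal sketch. This is not part of the patch itself: it assumes the `NODE_1`/`NODE_2` and `IP_1`/`IP_2` variables from the patched batch script are already defined, and the port `8888` and script name `main.py` are arbitrary placeholders.

```bash
#!/bin/bash
# Illustrative sketch only: build TF_CONFIG for a two-worker
# tf.distribute.MultiWorkerMirroredStrategy run, reusing the node
# names and IPs resolved in the patched script. Port 8888 and
# main.py are assumptions, not values taken from the patch.
PORT=8888
WORKERS="[\"${IP_1}:${PORT}\", \"${IP_2}:${PORT}\"]"

# Worker index 0 runs on the first node, index 1 on the second;
# TF_CONFIG is set as a prefix to each command, as the documentation describes.
TF_CONFIG="{\"cluster\": {\"worker\": ${WORKERS}}, \"task\": {\"type\": \"worker\", \"index\": 0}}" \
    srun --nodelist=${NODE_1} --nodes=1 --ntasks=1 python main.py &
TF_CONFIG="{\"cluster\": {\"worker\": ${WORKERS}}, \"task\": {\"type\": \"worker\", \"index\": 1}}" \
    srun --nodelist=${NODE_2} --nodes=1 --ntasks=1 python main.py &
wait
```

Both workers receive the same cluster list and differ only in their `index`, which is how TensorFlow distinguishes the processes.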