From ef94e413855bc9deff2660dcf11d36b67e5e4123 Mon Sep 17 00:00:00 2001
From: Natalie Breidenbach <natalie.breidenbach@tu-dresden.de>
Date: Tue, 28 Nov 2023 08:09:43 +0100
Subject: [PATCH] Update distributed_training.md: replace partitions with clusters

Refer to `alpha` and Power as clusters instead of partitions, drop the
obsolete `--partition` sbatch options, and resolve node names via
`alpha.hpc.tu-dresden.de` instead of `taurus.hrsk.tu-dresden.de`.
---
 .../docs/software/distributed_training.md            | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/doc.zih.tu-dresden.de/docs/software/distributed_training.md b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
index 094b6f8dc..096281640 100644
--- a/doc.zih.tu-dresden.de/docs/software/distributed_training.md
+++ b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
@@ -99,13 +99,12 @@ Each worker runs the training loop independently.
     TensorFlow is available as a module.
     Check for the version.
     The `TF_CONFIG` environment variable can be set as a prefix to the command.
-    Now, run the script on the partition `alpha` simultaneously on both nodes:
+    Now, run the script simultaneously on both nodes of the cluster `alpha`:
 
     ```bash
     #!/bin/bash
 
     #SBATCH --job-name=distr
-    #SBATCH --partition=alpha
     #SBATCH --output=%j.out
     #SBATCH --error=%j.err
     #SBATCH --mem=64000
@@ -121,8 +120,8 @@ Each worker runs the training loop independently.
     }
     NODE_1=$(print_nodelist | awk '{print $1}' | sort -u | head -n 1)
     NODE_2=$(print_nodelist | awk '{print $1}' | sort -u | tail -n 1)
-    IP_1=$(dig +short ${NODE_1}.taurus.hrsk.tu-dresden.de)
-    IP_2=$(dig +short ${NODE_2}.taurus.hrsk.tu-dresden.de)
+    IP_1=$(dig +short ${NODE_1}.alpha.hpc.tu-dresden.de)
+    IP_2=$(dig +short ${NODE_2}.alpha.hpc.tu-dresden.de)
 
     module load modenv/hiera
     module load modenv/hiera GCC/10.2.0 CUDA/11.1.1 OpenMPI/4.0.5 TensorFlow/2.4.1
@@ -257,7 +256,7 @@ marie@compute$ module spider Horovod           # Check available modules
 marie@compute$ module load Horovod/0.19.5-fosscuda-2019b-TensorFlow-2.2.0-Python-3.7.4
 ```
 
-Or if you want to use Horovod on the partition `alpha`, you can load it with the dependencies:
+Alternatively, you can load Horovod on the cluster `alpha` together with its dependencies:
 
 ```console
 marie@alpha$ module spider Horovod                         #Check available modules
@@ -324,7 +323,7 @@ Hello from: 0
     [official examples](https://github.com/horovod/horovod/tree/master/examples)
     to parallelize your code.
     In Horovod, each GPU gets pinned to a process.
-    You can easily start your job with the following bash script with four processes on two nodes:
+    You can easily start your job with the following bash script with four processes on two nodes of the cluster Power:
 
     ```bash
     #!/bin/bash
@@ -332,7 +331,6 @@ Hello from: 0
     #SBATCH --ntasks=4
     #SBATCH --ntasks-per-node=2
     #SBATCH --gres=gpu:2
-    #SBATCH --partition=ml
     #SBATCH --mem=250G
     #SBATCH --time=01:00:00
     #SBATCH --output=run_horovod.out
-- 
GitLab