From e0d1995d87dfc4daa7c2635b47cdc3b8d464a8f2 Mon Sep 17 00:00:00 2001
From: Martin Schroschk <martin.schroschk@tu-dresden.de>
Date: Fri, 4 Mar 2022 12:07:21 +0100
Subject: [PATCH] Provide long Slurm options

---
 .../docs/contrib/content_rules.md             | 12 +++----
 .../docs/jobs_and_resources/slurm.md          | 10 +++---
 .../docs/jobs_and_resources/slurm_examples.md | 34 +++++++++----------
 .../docs/software/data_analytics_with_r.md    |  4 +--
 .../docs/software/distributed_training.md     |  4 +--
 .../docs/software/fem_software.md             | 10 +++---
 .../docs/software/nanoscale_simulations.md    |  4 +--
 .../software/python_virtual_environments.md   |  2 +-
 .../docs/software/visualization.md            | 31 +++++++++--------
 9 files changed, 56 insertions(+), 55 deletions(-)

diff --git a/doc.zih.tu-dresden.de/docs/contrib/content_rules.md b/doc.zih.tu-dresden.de/docs/contrib/content_rules.md
index bede100b0..b4c84ff84 100644
--- a/doc.zih.tu-dresden.de/docs/contrib/content_rules.md
+++ b/doc.zih.tu-dresden.de/docs/contrib/content_rules.md
@@ -198,9 +198,9 @@ Line numbers can be added via
 
 ```bash linenums="1"
 #!/bin/bash
-#SBATCH -N 1
-#SBATCH -n 23
-#SBATCH -t 02:10:00
+#SBATCH --nodes=1
+#SBATCH --ntasks=23
+#SBATCH --time=02:10:00
 
 srun a.out
 ```
@@ -216,9 +216,9 @@ Specific Lines can be highlighted by using
 
 ```bash hl_lines="2 3"
 #!/bin/bash
-#SBATCH -N 1
-#SBATCH -n 23
-#SBATCH -t 02:10:00
+#SBATCH --nodes=1
+#SBATCH --ntasks=23
+#SBATCH --time=02:10:00
 
 srun a.out
 ```
diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md
index 9fa70fef2..0fe309208 100644
--- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md
+++ b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md
@@ -376,13 +376,13 @@ If you want to use your reservation, you have to add the parameter
 
 ## Node Features for Selective Job Submission
 
-The nodes in our HPC system are becoming more diverse in multiple aspects: hardware, mounted
-storage, software. The system administrators can describe the set of properties and it is up to the
-user to specify her/his requirements. These features should be thought of as changing over time
+The nodes in our HPC system are becoming more diverse in multiple aspects, e.g., hardware, mounted
+storage, software. The system administrators can describe the set of properties and it is up to you
+as a user to specify the requirements. These features should be thought of as changing over time
 (e.g., a filesystem get stuck on a certain node).
 
-A feature can be used with the Slurm option `--constrain` or `-C` like
-`srun -C fs_lustre_scratch2 ...` with `srun` or `sbatch`. Combinations like
+A feature can be used with the Slurm option `-C, --constraint=<ARG>` like
+`srun --constraint=fs_lustre_scratch2 ...` with `srun` or `sbatch`. Combinations like
 `--constraint="fs_beegfs_global0`are allowed. For a detailed description of the possible
 constraints, please refer to the [Slurm documentation](https://slurm.schedmd.com/srun.html).
 
diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md
index fecea7ad7..b7339d873 100644
--- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md
+++ b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md
@@ -7,12 +7,12 @@ depend on the type of parallelization and architecture.
 
 ### OpenMP Jobs
 
-An SMP-parallel job can only run within a node, so it is necessary to include the options `-N 1` and
-`-n 1`. The maximum number of processors for an SMP-parallel program is 896 and 56 on partition
-`taurussmp8` and `smp2`, respectively. Please refer to the
+An SMP-parallel job can only run within a node, so it is necessary to include the options `--nodes=1`
+and `--ntasks=1`. The maximum number of processors for an SMP-parallel program is 896 and 56 on
+partition `taurussmp8` and `smp2`, respectively. Please refer to the
 [partitions section](partitions_and_limits.md#memory-limits) for up-to-date information. Using the
 option `--cpus-per-task=<N>` Slurm will start one task and you will have `N` CPUs available for your
-job. An example job file would look like:
+job. An example job file would look like:
 
 !!! example "Job file for OpenMP application"
 
     ```Bash
     #!/bin/bash
     #SBATCH --tasks-per-node=1
     #SBATCH --cpus-per-task=8
     #SBATCH --time=08:00:00
-    #SBATCH -J Science1
+    #SBATCH --job-name=Science1
     #SBATCH --mail-type=end
-    #SBATCH --mail-user=your.name@tu-dresden.de
+    #SBATCH --mail-user=<your.email>@tu-dresden.de
 
     export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
     ./path/to/binary
@@ -48,9 +48,9 @@ For MPI-parallel jobs one typically allocates one core per task that has to be s
     #!/bin/bash
     #SBATCH --ntasks=864
     #SBATCH --time=08:00:00
-    #SBATCH -J Science1
+    #SBATCH --job-name=Science1
     #SBATCH --mail-type=end
-    #SBATCH --mail-user=your.name@tu-dresden.de
+    #SBATCH --mail-user=<your.email>@tu-dresden.de
 
     srun ./path/to/binary
     ```
@@ -70,9 +70,9 @@ below.
     #SBATCH --ntasks=4
     #SBATCH --cpus-per-task=1
     #SBATCH --time=01:00:00
-    #SBATCH -J PseudoParallelJobs
+    #SBATCH --job-name=PseudoParallelJobs
     #SBATCH --mail-type=end
-    #SBATCH --mail-user=your.name@tu-dresden.de
+    #SBATCH --mail-user=<your.email>@tu-dresden.de
 
     # The following sleep command was reported to fix warnings/errors with srun by users (feel free to uncomment).
     #sleep 5
@@ -109,7 +109,7 @@ for `sbatch/srun` in this case is `--gres=gpu:[NUM_PER_NODE]` (where `NUM_PER_NO
     #SBATCH --cpus-per-task=6      # use 6 threads per task
     #SBATCH --gres=gpu:1           # use 1 GPU per node (i.e. use one GPU per task)
     #SBATCH --time=01:00:00        # run for 1 hour
-    #SBATCH -A Project1            # account CPU time to Project1
+    #SBATCH --account=Project1     # account CPU time to Project1
 
     srun ./your/cuda/application   # start you application (probably requires MPI to use both nodes)
     ```
@@ -247,7 +247,7 @@ two you might want to use. Since we use cgroups for separation of jobs, your job
 use more resources than requested.*
 
 If you just want to use all available cores in a node, you have to specify how Slurm should organize
-them, like with `-p haswell -c 24` or `-p haswell --ntasks-per-node=24`.
+them, like with `--partition=haswell --cpus-per-task=24` or `--partition=haswell --ntasks-per-node=24`.
 Here is a short example to ensure that a benchmark is not spoiled by other jobs, even if it doesn't
 use up all resources in the nodes:
 
@@ -256,13 +256,13 @@ use up all resources in the nodes:
 
     ```Bash
    #!/bin/bash
-    #SBATCH -p haswell
+    #SBATCH --partition=haswell
     #SBATCH --nodes=2
     #SBATCH --ntasks-per-node=2
     #SBATCH --cpus-per-task=8
     #SBATCH --exclusive            # ensure that nobody spoils my measurement on 2 x 2 x 8 cores
     #SBATCH --time=00:10:00
-    #SBATCH -J Benchmark
+    #SBATCH --job-name=Benchmark
     #SBATCH --mail-user=your.name@tu-dresden.de
 
     srun ./my_benchmark
@@ -299,11 +299,11 @@ name specific to the job:
 
     ```Bash
     #!/bin/bash
     #SBATCH --array 0-9
-    #SBATCH -o arraytest-%A_%a.out
-    #SBATCH -e arraytest-%A_%a.err
+    #SBATCH --output=arraytest-%A_%a.out
+    #SBATCH --error=arraytest-%A_%a.err
     #SBATCH --ntasks=864
     #SBATCH --time=08:00:00
-    #SBATCH -J Science1
+    #SBATCH --job-name=Science1
     #SBATCH --mail-type=end
     #SBATCH --mail-user=your.name@tu-dresden.de
diff --git a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md
index afead82a8..1f6be0614 100644
--- a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md
+++ b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md
@@ -269,8 +269,8 @@ since both are running multicore jobs on a **single** node. Below is an example:
 #SBATCH --tasks-per-node=1
 #SBATCH --cpus-per-task=16
 #SBATCH --time=00:10:00
-#SBATCH -o test_Rmpi.out
-#SBATCH -e test_Rmpi.err
+#SBATCH --output=test_Rmpi.out
+#SBATCH --error=test_Rmpi.err
 
 module purge
 module load modenv/scs5
diff --git a/doc.zih.tu-dresden.de/docs/software/distributed_training.md b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
index 41cd1dab3..4e8fc427e 100644
--- a/doc.zih.tu-dresden.de/docs/software/distributed_training.md
+++ b/doc.zih.tu-dresden.de/docs/software/distributed_training.md
@@ -128,10 +128,10 @@ Each worker runs the training loop independently.
     module load modenv/hiera GCC/10.2.0 CUDA/11.1.1 OpenMPI/4.0.5 TensorFlow/2.4.1
 
     # On the first node
-    TF_CONFIG='{"cluster": {"worker": ["'"${NODE_1}"':33562", "'"${NODE_2}"':33561"]}, "task": {"index": 0, "type": "worker"}}' srun -w ${NODE_1} -N 1 --ntasks=1 --gres=gpu:1 python main_ddl.py &
+    TF_CONFIG='{"cluster": {"worker": ["'"${NODE_1}"':33562", "'"${NODE_2}"':33561"]}, "task": {"index": 0, "type": "worker"}}' srun --nodelist=${NODE_1} --nodes=1 --ntasks=1 --gres=gpu:1 python main_ddl.py &
 
     # On the second node
-    TF_CONFIG='{"cluster": {"worker": ["'"${NODE_1}"':33562", "'"${NODE_2}"':33561"]}, "task": {"index": 1, "type": "worker"}}' srun -w ${NODE_2} -N 1 --ntasks=1 --gres=gpu:1 python main_ddl.py &
+    TF_CONFIG='{"cluster": {"worker": ["'"${NODE_1}"':33562", "'"${NODE_2}"':33561"]}, "task": {"index": 1, "type": "worker"}}' srun --nodelist=${NODE_2} --nodes=1 --ntasks=1 --gres=gpu:1 python main_ddl.py &
 
     wait
     ```
diff --git a/doc.zih.tu-dresden.de/docs/software/fem_software.md b/doc.zih.tu-dresden.de/docs/software/fem_software.md
index d8bffb0a7..d04e036b9 100644
--- a/doc.zih.tu-dresden.de/docs/software/fem_software.md
+++ b/doc.zih.tu-dresden.de/docs/software/fem_software.md
@@ -59,7 +59,7 @@ Slurm or [writing job files](../jobs_and_resources/slurm.md#job-files).
     #SBATCH --job-name=yyyy                           # give a name, what ever you want
     #SBATCH --mail-type=END,FAIL                      # send email when the job finished or failed
     #SBATCH --mail-user=<name>@mailbox.tu-dresden.de  # set your email
-    #SBATCH -A p_xxxxxxx                              # charge compute time to your project
+    #SBATCH --account=p_xxxxxxx                       # charge compute time to your project
 
     # Abaqus has its own MPI
 
@@ -75,7 +75,7 @@ Slurm or [writing job files](../jobs_and_resources/slurm.md#job-files).
     ```
 4. Control the status of the job
     ```
-    marie@login squeue -u your_login     # in column "ST" (Status) you will find a R=Running or P=Pending (waiting for resources)
+    marie@login squeue --me              # in column "ST" (Status) you will find a R=Running or P=Pending (waiting for resources)
     ```
 
 ## Ansys
@@ -114,7 +114,7 @@ If more time is needed, a CPU has to be allocated like this (see
 
 ```console
 marie@login$ module load ANSYS/<version>
-marie@login$ srun -t 00:30:00 --x11=first [SLURM_OPTIONS] --pty bash
+marie@login$ srun --time=00:30:00 --x11=first [SLURM_OPTIONS] --pty bash
 [...]
 marie@login$ runwb2
 ```
@@ -208,7 +208,7 @@ firewall of ZIH. For further information, please refer to the COMSOL manual.
 
     ```console
     marie@login$ module load COMSOL
-    marie@login$ srun -n 1 -c 4 --mem-per-cpu=2500 -t 8:00 comsol -np 4 server
+    marie@login$ srun --ntasks=1 --cpus-per-task=4 --mem-per-cpu=2500 --time=8:00 comsol -np 4 server
     ```
 
 ??? example "Interactive Job"
@@ -218,7 +218,7 @@ firewall of ZIH. For further information, please refer to the COMSOL manual.
 
     ```console
     marie@login$ module load COMSOL
-    marie@login$ srun -n 1 -c 4 --mem-per-cpu=2500 -t 8:00 --pty --x11=first comsol -np 4
+    marie@login$ srun --ntasks=1 --cpus-per-task=4 --mem-per-cpu=2500 --time=8:00 --pty --x11=first comsol -np 4
     ```
 
 Please make sure, that the option *Preferences* --> Graphics --> *Renedering* is set to *software
diff --git a/doc.zih.tu-dresden.de/docs/software/nanoscale_simulations.md b/doc.zih.tu-dresden.de/docs/software/nanoscale_simulations.md
index 14018d374..9727d2d35 100644
--- a/doc.zih.tu-dresden.de/docs/software/nanoscale_simulations.md
+++ b/doc.zih.tu-dresden.de/docs/software/nanoscale_simulations.md
@@ -81,8 +81,8 @@ For runs with [Slurm](../jobs_and_resources/slurm.md), please use a script like
 
 ```Bash
 #!/bin/bash
-#SBATCH -t 120
-#SBATCH -n 8
+#SBATCH --time=120
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=2
 ## you have to make sure that an even number of tasks runs on each node !!
 #SBATCH --mem-per-cpu=1900
diff --git a/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md b/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md
index d6ae27186..13b623174 100644
--- a/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md
+++ b/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md
@@ -139,7 +139,7 @@ can deactivate the conda environment as follows:
     This is an example on partition Alpha. The example creates a virtual environment, and installs
     the package `torchvision` with pip.
     ```console
-    marie@login$ srun --partition=alpha-interactive -N=1 --gres=gpu:1 --time=01:00:00 --pty bash
+    marie@login$ srun --partition=alpha-interactive --nodes=1 --gres=gpu:1 --time=01:00:00 --pty bash
     marie@alpha$ mkdir python-environments        # please use workspaces
     marie@alpha$ module load modenv/hiera GCC/10.2.0 CUDA/11.1.1 OpenMPI/4.0.5 PyTorch
     Module GCC/10.2.0, CUDA/11.1.1, OpenMPI/4.0.5, PyTorch/1.9.0 and 54 dependencies loaded.
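
The hunk above ends right after the modules are loaded. A minimal sketch of the remaining steps the
text describes — creating the virtual environment and installing `torchvision` with pip — could look
like the following; the environment name `my-torch-env` is purely illustrative and not part of the
patch:

```console
marie@alpha$ virtualenv --system-site-packages python-environments/my-torch-env   # env name is an example
marie@alpha$ source python-environments/my-torch-env/bin/activate
(my-torch-env) marie@alpha$ pip install torchvision
```
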
diff --git a/doc.zih.tu-dresden.de/docs/software/visualization.md b/doc.zih.tu-dresden.de/docs/software/visualization.md
index 8116af22e..987e59b67 100644
--- a/doc.zih.tu-dresden.de/docs/software/visualization.md
+++ b/doc.zih.tu-dresden.de/docs/software/visualization.md
@@ -38,9 +38,10 @@ parallel, if it was built using MPI.
 ```
 
 The resources for the MPI processes have to be allocated via the
-[batch system](../jobs_and_resources/slurm.md) option `-c NUM` (not `-n`, as it would be usually for
-MPI processes). It might be valuable in terms of runtime to bind/pin the MPI processes to hardware.
-A convenient option is `-bind-to core`. All other options can be obtained by
+[batch system](../jobs_and_resources/slurm.md) option `--cpus-per-task=<NUM>` (not `--ntasks=<NUM>`,
+as it usually would be for MPI processes). It might be valuable in terms of runtime to bind/pin the
+MPI processes to hardware. A convenient option is `-bind-to core`. All other options can be
+obtained by
 
 ```console
 marie@login$ mpiexec -bind-to -help`
@@ -57,8 +58,8 @@ interactive allocation.
 
     ```Bash
     #!/bin/bash
-    #SBATCH -N 1
-    #SBATCH -c 12
+    #SBATCH --nodes=1
+    #SBATCH --cpus-per-task=12
     #SBATCH --time=01:00:00
 
     # Make sure to only use ParaView
@@ -71,7 +72,7 @@ interactive allocation.
 ??? example "Example of interactive allocation using `salloc`"
 
     ```console
-    marie@login$ salloc -N 1 -c 16 --time=01:00:00 bash
+    marie@login$ salloc --nodes=1 --cpus-per-task=16 --time=01:00:00 bash
     salloc: Pending job allocation 336202
     salloc: job 336202 queued and waiting for resources
     salloc: job 336202 has been allocated resources
@@ -102,8 +103,8 @@ cards (GPUs) specified by the device index. For that, make sure to use the modul
 
     ```Bash
     #!/bin/bash
-    #SBATCH -N 1
-    #SBATCH -c 12
+    #SBATCH --nodes=1
+    #SBATCH --cpus-per-task=12
     #SBATCH --gres=gpu:2
     #SBATCH --partition=gpu2
     #SBATCH --time=01:00:00
@@ -133,7 +134,7 @@ handling. First, you need to open a DCV session, so please follow the instructio
 virtual desktop session, then load the ParaView module as usual and start the GUI:
 
 ```console
-marie@dcv module load ParaView/5.7.0
+marie@dcv$ module load ParaView/5.7.0
 paraview
 ```
 
@@ -156,7 +157,7 @@ processes.
 
     ```console
     marie@login$ module ParaView/5.7.0-osmesa
-    marie@login$ srun -N1 -n8 --mem-per-cpu=2500 -p interactive --pty pvserver --force-offscreen-rendering
+    marie@login$ srun --nodes=1 --ntasks=8 --mem-per-cpu=2500 --partition=interactive --pty pvserver --force-offscreen-rendering
     srun: job 2744818 queued and waiting for resources
     srun: job 2744818 has been allocated resources
    Waiting for client...
@@ -188,8 +189,8 @@ marie@local$ ssh -L 22222:172.24.140.229:11111 taurus
 
 !!! important "SSH command"
 
-    The previous SSH command requires that you have already set up your [SSH configuration
-    ](../access/ssh_login.md#configuring-default-parameters-for-ssh).
+    The previous SSH command requires that you have already set up your
+    [SSH configuration](../access/ssh_login.md#configuring-default-parameters-for-ssh).
 
 The final step is to start ParaView locally on your own machine and add the connection
 
@@ -239,8 +240,8 @@ it into thinking your provided GL rendering version is higher than what it actua
 
 ??? example
 
-    The following lines requires that you have already set up your [SSH configuration
-    ](../access/ssh_login.md#configuring-default-parameters-for-ssh).
+    The following lines require that you have already set up your
+    [SSH configuration](../access/ssh_login.md#configuring-default-parameters-for-ssh).
 
     ```console
     # 1st, connect to ZIH systems using X forwarding (-X).
     marie@local$ ssh -XC taurus
 
     # 2nd, load the ParaView module and override the GL version
     marie@login$ module load ParaView/5.7.0
     marie@login$ export MESA_GL_VERSION_OVERRIDE=3.2
 
     # 3rd, start the ParaView GUI inside an interactive job. Don't forget the --x11 parameter for X forwarding:
-    marie@login$ srun -n1 -c1 -p interactive --mem-per-cpu=2500 --pty --x11=first paraview
+    marie@login$ srun --ntasks=1 --cpus-per-task=1 --partition=interactive --mem-per-cpu=2500 --pty --x11=first paraview
     ```
-- 
GitLab
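
As a compact reference for the long-form options introduced throughout this patch, a complete job
file written only with long options might look like the following sketch; the job name, project,
runtime, and e-mail address are placeholders, not recommendations:

```bash
#!/bin/bash
#SBATCH --nodes=1                                # run on a single node
#SBATCH --ntasks=1                               # with one task
#SBATCH --cpus-per-task=8                        # providing 8 CPUs for OpenMP threads
#SBATCH --time=02:00:00                          # placeholder maximum runtime of two hours
#SBATCH --job-name=example                       # placeholder job name
#SBATCH --account=p_xxxxxxx                      # placeholder project to charge
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=<your.email>@tu-dresden.de

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
srun ./path/to/binary
```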