diff --git a/Dockerfile b/Dockerfile index e0121582e4dffee8ed997d687ec390dd8ca117c2..5755c2b769928cd4047cc16c9a40a1769f0a04e2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ SHELL ["/bin/bash", "-c"] # Base # ######## -RUN pip install mkdocs>=1.1.2 mkdocs-material>=7.1.0 mkdocs-htmlproofer-plugin==0.8.0 mkdocs-video==1.3.0 +RUN pip install "mkdocs>=1.1.2" mkdocs-material==8.5.11 mkdocs-htmlproofer-plugin==0.8.0 mkdocs-video==1.3.0 ########## # Linter # @@ -14,7 +14,7 @@ RUN pip install mkdocs>=1.1.2 mkdocs-material>=7.1.0 mkdocs-htmlproofer-plugin== RUN apt-get update && apt-get install -y nodejs npm aspell git git-lfs -RUN npm install -g markdownlint-cli markdown-link-check +RUN npm install -g markdownlint-cli@0.32.2 markdown-link-check ########################################### # prepare git for automatic merging in CI # @@ -38,6 +38,9 @@ RUN echo 'test \! -e /docs/tud_theme/javascripts/mermaid.min.js && test -x /docs RUN echo 'exec "$@"' >> /entrypoint.sh RUN chmod u+x /entrypoint.sh +# Workaround https://gitlab.com/gitlab-org/gitlab-runner/-/issues/29022 +RUN git config --global --add safe.directory /docs + WORKDIR /docs CMD ["mkdocs", "build", "--verbose", "--strict"] diff --git a/doc.zih.tu-dresden.de/docs/access/jupyterhub_custom_environments.md b/doc.zih.tu-dresden.de/docs/access/jupyterhub_custom_environments.md index 9fe57b98a82ec0b1cc5a68ed42c6a115ed68b4e5..86b8486e26b8c1fdfeedb9821e36d4ce9e544d8b 100644 --- a/doc.zih.tu-dresden.de/docs/access/jupyterhub_custom_environments.md +++ b/doc.zih.tu-dresden.de/docs/access/jupyterhub_custom_environments.md @@ -1,4 +1,4 @@ -# Creating and Using a Custom Environment for JupyterHub +# Custom Environments for JupyterHub !!! 
info diff --git a/doc.zih.tu-dresden.de/docs/access/ssh_login.md b/doc.zih.tu-dresden.de/docs/access/ssh_login.md index c5e2c6a21fc98799ac0726a22407e1ff1193377b..760c752a2399cc8a41c5f7a921bfada5ba473465 100644 --- a/doc.zih.tu-dresden.de/docs/access/ssh_login.md +++ b/doc.zih.tu-dresden.de/docs/access/ssh_login.md @@ -1,4 +1,4 @@ -# Connecting via terminal +# Connecting via Terminal (Linux, Mac, Windows) Connecting via terminal works on every operating system. For Linux and Mac operating systems no additional software is required. For users of a Windows OS a recent version of Windows is diff --git a/doc.zih.tu-dresden.de/docs/access/ssh_mobaxterm.md b/doc.zih.tu-dresden.de/docs/access/ssh_mobaxterm.md index 791f7578956ba1848b9f8ae6650a3f93f061a849..85d5f9cfc1e55078ba9c79e881d7156dacf9ca0a 100644 --- a/doc.zih.tu-dresden.de/docs/access/ssh_mobaxterm.md +++ b/doc.zih.tu-dresden.de/docs/access/ssh_mobaxterm.md @@ -1,13 +1,11 @@ -# Connecting from Windows with MobaXterm +# Connecting with MobaXterm (Windows) -MobaXterm is an enhanced terminal for Windows with an X11 server, a tabbed SSH client, network -tools and more. - -Visit its homepage for more information (https://mobaxterm.mobatek.net). +[MobaXterm](https://mobaxterm.mobatek.net) is an enhanced terminal for Windows with an X11 server, +a tabbed SSH client, network tools and more. ## Download and install -To download go to [MobaXterm homepage](https://mobaxterm.mobatek.net/download-home-edition.html) +To download go to [MobaXterm download page](https://mobaxterm.mobatek.net/download-home-edition.html) and download a free home edition.  
diff --git a/doc.zih.tu-dresden.de/docs/access/ssh_putty.md b/doc.zih.tu-dresden.de/docs/access/ssh_putty.md index f6f390868a78670d339162705ef2f79057d4ec01..28105fef81b555f6a4af28fc8fdcc128797ea65a 100644 --- a/doc.zih.tu-dresden.de/docs/access/ssh_putty.md +++ b/doc.zih.tu-dresden.de/docs/access/ssh_putty.md @@ -1,4 +1,4 @@ -# Connecting from Windows with PuTTY +# Connecting with PuTTY (Windows) PuTTY is a free and open-source terminal emulator, serial console and network file transfer application, supports several network protocols, including SCP, SSH. Visit the diff --git a/doc.zih.tu-dresden.de/docs/application/acknowledgement.md b/doc.zih.tu-dresden.de/docs/application/acknowledgement.md index 2cbb89c2b88afe5933da677f53676e600387ce1c..dfe9fd0e42bc6d97aadada80e499ac9dac2d3b50 100644 --- a/doc.zih.tu-dresden.de/docs/application/acknowledgement.md +++ b/doc.zih.tu-dresden.de/docs/application/acknowledgement.md @@ -1,4 +1,4 @@ -# Acknowledgment +# Acknowledgement To provide you with modern and powerful HPC systems in future as well, we have to show that these systems help to advance research. For that purpose we rely on your help. In most cases, the results diff --git a/doc.zih.tu-dresden.de/docs/application/terms_of_use.md b/doc.zih.tu-dresden.de/docs/application/terms_of_use.md index 45d1566ec3ca55869f7a02f83ac60f6cd8f1708a..7410198950182fe0eada76d4e02885899c803c77 100644 --- a/doc.zih.tu-dresden.de/docs/application/terms_of_use.md +++ b/doc.zih.tu-dresden.de/docs/application/terms_of_use.md @@ -1,4 +1,4 @@ -# Terms Of Use / Nutzungsbedingungen +# Terms of Use !!! 
attention diff --git a/doc.zih.tu-dresden.de/docs/archive/beegfs_on_demand.md b/doc.zih.tu-dresden.de/docs/archive/beegfs_on_demand.md index 55a6e106dcc42a62050cc75759ccc943f2dd688c..9ccbb99dd41d26e69845455e88f527640d1560b8 100644 --- a/doc.zih.tu-dresden.de/docs/archive/beegfs_on_demand.md +++ b/doc.zih.tu-dresden.de/docs/archive/beegfs_on_demand.md @@ -1,4 +1,4 @@ -# BeeGFS Filesystem (Outdated) +# BeeGFS Filesystem on Demand (Outdated) !!! warning diff --git a/doc.zih.tu-dresden.de/docs/archive/no_ib_jobs.md b/doc.zih.tu-dresden.de/docs/archive/no_ib_jobs.md index 79fb6dcdde89c460671c1b7adb9cc7f7101973ef..8bf78779bdedf8b68d7efd60999878373da3edea 100644 --- a/doc.zih.tu-dresden.de/docs/archive/no_ib_jobs.md +++ b/doc.zih.tu-dresden.de/docs/archive/no_ib_jobs.md @@ -1,4 +1,4 @@ -# Jobs without Infiniband (Outdated) +# Jobs without InfiniBand (Outdated) !!! warning diff --git a/doc.zih.tu-dresden.de/docs/archive/systems_switched_off.md b/doc.zih.tu-dresden.de/docs/archive/systems_switched_off.md index 4f1319e046f05621aa155a64260fb4327942f4e1..929bcd44de2e225a4cd8c9b030c5fa90d89f7174 100644 --- a/doc.zih.tu-dresden.de/docs/archive/systems_switched_off.md +++ b/doc.zih.tu-dresden.de/docs/archive/systems_switched_off.md @@ -1,4 +1,4 @@ -# Hardware (Outdated) +# Switched-Off Systems (Outdated) HPC at ZIH has a quite long history and several systems have been installed and operated. Documentation on former systems for future reference can be found on the following pages: diff --git a/doc.zih.tu-dresden.de/docs/data_lifecycle/lustre.md b/doc.zih.tu-dresden.de/docs/data_lifecycle/lustre.md index 87263d924319cb7d3aae6bdf16c55eb3d0551afc..0c980b79f9135a775d0135ce943d3e2e8c494a8e 100644 --- a/doc.zih.tu-dresden.de/docs/data_lifecycle/lustre.md +++ b/doc.zih.tu-dresden.de/docs/data_lifecycle/lustre.md @@ -108,7 +108,7 @@ line mode within this documentation. !!! hint "Filesystem vs. 
Path" If you provide a path to the lfs commands instead of a filesystem, the lfs option is applied to - the filesystem this path is in. Thus, the provied information refers to the whole filesystem, + the filesystem this path is in. Thus, the passed information refers to the whole filesystem, not the path. You can retrieve a complete list of available options: diff --git a/doc.zih.tu-dresden.de/docs/data_lifecycle/workspaces.md b/doc.zih.tu-dresden.de/docs/data_lifecycle/workspaces.md index 29645325abeaaa97b0e6d8120b553136dddaaf14..a450eb22b3ea7cadb756d78bb15bcb5c04773de1 100644 --- a/doc.zih.tu-dresden.de/docs/data_lifecycle/workspaces.md +++ b/doc.zih.tu-dresden.de/docs/data_lifecycle/workspaces.md @@ -28,7 +28,8 @@ times. ### List Available Filesystems -To list all available filesystems for using workspaces, use: +To list all available filesystems for using workspaces, you can either invoke `ws_list -l` or +`ws_find -l`, e.g., ```console marie@login$ ws_find -l @@ -42,12 +43,13 @@ beegfs !!! note "Default is `scratch`" - The default filesystem is `scratch`. If you prefer another filesystem, provide the option `-F - <fs>` to the workspace commands. + The default filesystem is `scratch`. If you prefer another filesystem (cf. section + [List Available Filesystems](#list-available-filesystems)), you have to explicitly + provide the option `-F <fs>` to the workspace commands. ### List Current Workspaces -To list all workspaces you currently own, use: +The command `ws_list` lists all your currently active (i.e., not expired) workspaces, e.g. ```console marie@login$ ws_list @@ -60,13 +62,84 @@ id: test-workspace available extensions : 10 ``` +The output of `ws_list` can be customized via several options. The following switch tab provides an 
+ +=== "Certain filesystem" + + ``` + marie@login$ ws_list --filesystem scratch_fast + id: numbercrunch + workspace directory : /lustre/ssd/ws/marie-numbercrunch + remaining time : 2 days 23 hours + creation time : Thu Mar 2 14:15:33 2023 + expiration date : Sun Mar 5 14:15:33 2023 + filesystem name : ssd + available extensions : 2 + ``` + +=== "Verbose output" + + ``` + marie@login$ ws_list -v + id: test-workspace + workspace directory : /scratch/ws/0/marie-test-workspace + remaining time : 89 days 23 hours + creation time : Thu Jul 29 10:30:04 2021 + expiration date : Wed Oct 27 10:30:04 2021 + filesystem name : scratch + available extensions : 10 + acctcode : p_numbercrunch + reminder : Sat Oct 20 10:30:04 2021 + mailaddress : marie@tu-dresden.de + ``` + +=== "Terse output" + + ``` + marie@login$ ws_list -t + id: test-workspace + workspace directory : /scratch/ws/0/marie-test-workspace + remaining time : 89 days 23 hours + available extensions : 10 + id: foo + workspace directory : /scratch/ws/0/marie-foo + remaining time : 3 days 22 hours + available extensions : 10 + ``` + +=== "Show only names" + + ``` + marie@login$ ws_list -s + test-workspace + foo + ``` + +=== "Sort by remaining time" + + You can list your currently allocated workspaces by remaining time. This is especially useful + for housekeeping tasks, such as extending soon expiring workspaces if necessary. + + ``` + marie@login$ ws_list -R -t + id: test-workspace + workspace directory : /scratch/ws/0/marie-test-workspace + remaining time : 89 days 23 hours + available extensions : 10 + id: foo + workspace directory : /scratch/ws/0/marie-foo + remaining time : 3 days 22 hours + available extensions : 10 + ``` + ### Allocate a Workspace -To create a workspace in one of the listed filesystems, use `ws_allocate`. It is necessary to +To allocate a workspace in one of the listed filesystems, use `ws_allocate`. It is necessary to specify a unique name and the duration of the workspace. 
```console -marie@login$ ws_allocate: [options] workspace_name duration +ws_allocate: [options] workspace_name duration Options: -h [ --help] produce help message @@ -95,10 +168,19 @@ Options: This will create a workspace with the name `test-workspace` on the `/scratch` filesystem for 90 days with an email reminder for 7 days before the expiration. -!!! Note +!!! Note "Email reminder" Setting the reminder to `7` means you will get a reminder email on every day starting `7` days - prior to expiration date. + prior to expiration date. We strongly recommend setting this email reminder. + +!!! Note "Name of a workspace" + + The workspace name should help you to remember the experiment and data stored here. It has to + be unique on a certain filesystem. On the other hand it is possible to use the very same name + for workspaces on different filesystems. + +Please refer to the [section Cooperative Usage](#cooperative-usage-group-workspaces) for +group workspaces. ### Extension of a Workspace @@ -202,7 +284,7 @@ It performs the following steps once per day and filesystem: ### Restoring Expired Workspaces At expiration time your workspace will be moved to a special, hidden directory. For a month (in -warm_archive: 2 months), you can still restore your data into an existing workspace. +warm_archive: 2 months), you can still restore your data **into an existing workspace**. !!! warning @@ -256,7 +338,7 @@ There are three typical options for the use of workspaces: ### Per-Job Storage -The idea of a "workspace per-job storage" adresses the need of a batch job for a directory for +The idea of a "workspace per-job storage" addresses the need of a batch job for a directory for temporary data which can be deleted afterwards. To help you to write your own [(Slurm) job file](../jobs_and_resources/slurm.md#job-files), suited to your needs, we came up with the following example (which works [for the program g16](../software/nanoscale_simulations.md)). 
@@ -392,6 +474,57 @@ marie@login$ qinfo quota /warm_archive/ws/ Note that the workspaces reside under the mountpoint `/warm_archive/ws/` and not `/warm_archive` anymore. +## Cooperative Usage (Group Workspaces) + +When a workspace is created with the option `-g, --group`, it gets a group workspace that is visible +to others (if in the same group) via `ws_list -g`. + +!!! hint "Choose group" + + If you are a member of multiple groups, then the group workspace is visible for your primary + group. You can list all groups you belong to via `groups`, and the first entry is your + primary group. + + Nevertheless, you can create a group workspace for any of your groups following these two + steps: + + 1. Change to the desired group using `newgrp <other-group>`. + 1. Create the group workspace as usual, i.e., `ws_allocate --group [...]` + + The [page on Sharing Data](data_sharing.md) provides + information on how to grant access to certain colleagues and whole project groups. + +!!! Example "Allocate and list group workspaces" + + If Marie wants to share results and scripts in a workspace with all of her colleagues + in the project `p_number_crunch`, she can allocate a so-called group workspace. + + ```console + marie@login$ ws_allocate --group --name numbercrunch --duration 30 + Info: creating workspace. + /scratch/ws/0/marie-numbercrunch + remaining extensions : 10 + remaining time in days: 30 + ``` + + This workspace directory is readable for the group, e.g., + + ```console + marie@login$ ls -ld /scratch/ws/0/marie-numbercrunch + drwxr-x--- 2 marie p_number_crunch 4096 Mar 2 15:24 /scratch/ws/0/marie-numbercrunch + ``` + + All members of the project group `p_number_crunch` can now list this workspace using + `ws_list -g` and access the data (read-only). 
+ + ```console + martin@login$ ws_list -g -t + id: numbercrunch + workspace directory : /scratch/ws/0/marie-numbercrunch + remaining time : 29 days 23 hours + available extensions : 10 + ``` + ## FAQ and Troubleshooting **Q**: I am getting the error `Error: could not create workspace directory!` diff --git a/doc.zih.tu-dresden.de/docs/data_transfer/datamover.md b/doc.zih.tu-dresden.de/docs/data_transfer/datamover.md index 28aba7bbfdcec8411f6510061d509c949d128f34..52eb38531de1a52971b46960663c091a0df56879 100644 --- a/doc.zih.tu-dresden.de/docs/data_transfer/datamover.md +++ b/doc.zih.tu-dresden.de/docs/data_transfer/datamover.md @@ -1,4 +1,4 @@ -# Datamover - Data Transfer Inside ZIH Systems +# Transfer Data Inside ZIH Systems with Datamover With the **Datamover**, we provide a special data transfer machine for transferring data with best transfer speed between the filesystems of ZIH systems. The Datamover machine is not accessible diff --git a/doc.zih.tu-dresden.de/docs/data_transfer/export_nodes.md b/doc.zih.tu-dresden.de/docs/data_transfer/export_nodes.md index b4a22c95e3193bc9ff1a7c43b107fe5f7f74f953..36cfb088dfb2865a25bdfab9f8c0c8ab644138b3 100644 --- a/doc.zih.tu-dresden.de/docs/data_transfer/export_nodes.md +++ b/doc.zih.tu-dresden.de/docs/data_transfer/export_nodes.md @@ -1,4 +1,4 @@ -# Export Nodes - Data Transfer to/from ZIH Systems +# Transfer Data to/from ZIH Systems via Export Nodes To copy large data to/from ZIH systems, the so-called **export nodes** should be used. 
While it is possible to transfer small files directly via the login nodes, they are not intended to be used that diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/hardware_overview.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/hardware_overview.md index 1d06a620e89ae43b796286e6add849644beae530..c4bd1c7909fda4fa27703c00c68e284be07a4cb0 100644 --- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/hardware_overview.md +++ b/doc.zih.tu-dresden.de/docs/jobs_and_resources/hardware_overview.md @@ -31,7 +31,7 @@ users and the ZIH. - 3.5 TB local memory on NVMe device at `/tmp` - Hostnames: `taurusi[8001-8034]` - Slurm partition: `alpha` -- Further information on the usage is documented on the site [AMD Rome Nodes](rome_nodes.md) +- Further information on the usage is documented on the site [Alpha Centauri Nodes](alpha_centauri.md) ## Island 7 - AMD Rome CPUs diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/mpi_issues.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/mpi_issues.md index bcabe289e3390e0ea6915ef33fb05eb1cff97fef..ccb34da378591594991ab746915fd90e9847920b 100644 --- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/mpi_issues.md +++ b/doc.zih.tu-dresden.de/docs/jobs_and_resources/mpi_issues.md @@ -1,4 +1,4 @@ -# Known MPI-Usage Issues +# Known Issues when Using MPI This pages holds known issues observed with MPI and concrete MPI implementations. diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/rome_nodes.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/rome_nodes.md index 905110c775721ded6ce280ef069b0b05e7ce146f..4347dd6b0e64005a67f4c60627a2002138a00631 100644 --- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/rome_nodes.md +++ b/doc.zih.tu-dresden.de/docs/jobs_and_resources/rome_nodes.md @@ -1,4 +1,4 @@ -# Island 7 - AMD Rome Nodes +# AMD Rome Nodes The hardware specification is documented on the page [HPC Resources](hardware_overview.md#island-7-amd-rome-cpus). 
diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md index f4cdcd9de79a45aa10a32f4e5bdb2b4edcde5419..adaf75cdf9a356307f023a85620fbc9f482dc019 100644 --- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md +++ b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm.md @@ -328,8 +328,8 @@ specifications for each component of the heterogeneous job should be separated w Running a job step on a specific component is supported by the option `--het-group`. ```console -marie@login$ salloc --ntasks 1 --cpus-per-task 4 --partition <partition> --mem=200G : \ - --ntasks 8 --cpus-per-task 1 --gres=gpu:8 --mem=80G --partition <partition> +marie@login$ salloc --ntasks=1 --cpus-per-task=4 --partition <partition> --mem=200G : \ + --ntasks=8 --cpus-per-task=1 --gres=gpu:8 --mem=80G --partition <partition> [...] marie@login$ srun ./my_application <args for master tasks> : ./my_application <args for worker tasks> ``` @@ -340,16 +340,16 @@ components by a line containing the directive `#SBATCH hetjob`. ```bash #!/bin/bash -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 4 -#SBATCH --partition <partition> +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --partition=<partition> #SBATCH --mem=200G #SBATCH hetjob # required to separate groups -#SBATCH --ntasks 8 -#SBATCH --cpus-per-task 1 +#SBATCH --ntasks=8 +#SBATCH --cpus-per-task=1 #SBATCH --gres=gpu:8 #SBATCH --mem=80G -#SBATCH --partition <partition> +#SBATCH --partition=<partition> srun ./my_application <args for master tasks> : ./my_application <args for worker tasks> @@ -474,7 +474,7 @@ at no extra cost. ??? example "Show all jobs since the beginning of year 2021" ```console - marie@login$ sacct -S 2021-01-01 [-E now] + marie@login$ sacct --starttime 2021-01-01 [--endtime now] ``` ## Jobs at Reservations @@ -501,24 +501,21 @@ as user to specify the requirements. These features should be thought of as chan (e.g., a filesystem get stuck on a certain node). 
A feature can be used with the Slurm option `-C, --constraint=<ARG>` like -`srun --constraint=fs_lustre_scratch2 ...` with `srun` or `sbatch`. Combinations like -`--constraint="fs_beegfs_global0`are allowed. For a detailed description of the possible -constraints, please refer to the [Slurm documentation](https://slurm.schedmd.com/srun.html). +`srun --constraint="fs_lustre_scratch2" [...]` with `srun` or `sbatch`. + +Multiple features can also be combined using AND, OR, matching OR, resource count etc. +E.g., `--constraint="fs_beegfs|fs_lustre_ssd"` requests for nodes with at least one of the +features `fs_beegfs` and `fs_lustre_ssd`. For a detailed description of the possible +constraints, please refer to the [Slurm documentation](https://slurm.schedmd.com/srun.html#OPT_constraint). !!! hint A feature is checked only for scheduling. Running jobs are not affected by changing features. -### Available Features - -| Feature | Description | -|:--------|:-------------------------------------------------------------------------| -| DA | Subset of Haswell nodes with a high bandwidth to NVMe storage (island 6) | - -#### Filesystem Features +### Filesystem Features A feature `fs_*` is active if a certain filesystem is mounted and available on a node. Access to -these filesystems are tested every few minutes on each node and the Slurm features set accordingly. +these filesystems are tested every few minutes on each node and the Slurm features are set accordingly. 
| Feature | Description | [Workspace Name](../data_lifecycle/workspaces.md#extension-of-a-workspace) | |:---------------------|:-------------------------------------------------------------------|:---------------------------------------------------------------------------| diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md index 703cae5642f256bdb124433d4d306537ff80375c..0322c2ce97f2801b4250d409718b36ab9b64f9fb 100644 --- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md +++ b/doc.zih.tu-dresden.de/docs/jobs_and_resources/slurm_examples.md @@ -186,7 +186,7 @@ When `srun` is used within a submission script, it inherits parameters from `sba `--ntasks=1`, `--cpus-per-task=4`, etc. So we actually implicitly run the following ```bash -srun --ntasks=1 --cpus-per-task=4 ... --partition=ml some-gpu-application +srun --ntasks=1 --cpus-per-task=4 [...] --partition=ml <some-gpu-application> ``` Now, our goal is to run four instances of this program concurrently in a single batch script. Of @@ -237,7 +237,7 @@ inherited from the surrounding `sbatch` context. 
The following line would be sufficient to do the job in this example: ```bash -srun --exclusive --gres=gpu:1 --ntasks=1 some-gpu-application & +srun --exclusive --gres=gpu:1 --ntasks=1 <some-gpu-application> & ``` Yet, it adds some extra safety to leave them in, enabling the Slurm batch system to complain if not @@ -278,7 +278,8 @@ use up all resources in the nodes: #SBATCH --exclusive # ensure that nobody spoils my measurement on 2 x 2 x 8 cores #SBATCH --time=00:10:00 #SBATCH --job-name=Benchmark - #SBATCH --mail-user=your.name@tu-dresden.de + #SBATCH --mail-type=end + #SBATCH --mail-user=<your.email>@tu-dresden.de srun ./my_benchmark ``` @@ -313,14 +314,14 @@ name specific to the job: ```Bash #!/bin/bash - #SBATCH --array 0-9 + #SBATCH --array=0-9 #SBATCH --output=arraytest-%A_%a.out #SBATCH --error=arraytest-%A_%a.err #SBATCH --ntasks=864 #SBATCH --time=08:00:00 #SBATCH --job-name=Science1 #SBATCH --mail-type=end - #SBATCH --mail-user=your.name@tu-dresden.de + #SBATCH --mail-user=<your.email>@tu-dresden.de echo "Hi, I am step $SLURM_ARRAY_TASK_ID in this array job $SLURM_ARRAY_JOB_ID" ``` @@ -338,36 +339,84 @@ Please read the Slurm documentation at https://slurm.schedmd.com/sbatch.html for ## Chain Jobs -You can use chain jobs to create dependencies between jobs. This is often the case if a job relies -on the result of one or more preceding jobs. Chain jobs can also be used if the runtime limit of the -batch queues is not sufficient for your job. Slurm has an option +You can use chain jobs to **create dependencies between jobs**. This is often useful if a job +relies on the result of one or more preceding jobs. Chain jobs can also be used to split a long +running job exceeding the batch queues limits into parts and chain these parts. Slurm has an option `-d, --dependency=<dependency_list>` that allows to specify that a job is only allowed to start if another job finished. 
-Here is an example of how a chain job can look like, the example submits 4 jobs (described in a job -file) that will be executed one after each other with different CPU numbers: +In the following we provide two examples for scripts that submit chain jobs. -!!! example "Script to submit jobs with dependencies" +??? example "Scaling experiment using chain jobs" - ```Bash + This scripts submits the very same job file `myjob.sh` four times, which will be executed one + after each other. The number of tasks is increased from job to job making this submit script a + good starting point for (strong) scaling experiments. + + ```Bash title="submit_scaling.sh" + #!/bin/bash - TASK_NUMBERS="1 2 4 8" - DEPENDENCY="" - JOB_FILE="myjob.slurm" - - for TASKS in $TASK_NUMBERS ; do - JOB_CMD="sbatch --ntasks=$TASKS" - if [ -n "$DEPENDENCY" ] ; then - JOB_CMD="$JOB_CMD --dependency afterany:$DEPENDENCY" + + task_numbers="1 2 4 8" + dependency="" + job_file="myjob.sh" + + for tasks in ${task_numbers} ; do + job_cmd="sbatch --ntasks=${tasks}" + if [ -n "${dependency}" ] ; then + job_cmd="${job_cmd} --dependency=afterany:${dependency}" fi - JOB_CMD="$JOB_CMD $JOB_FILE" - echo -n "Running command: $JOB_CMD " - OUT=`$JOB_CMD` - echo "Result: $OUT" - DEPENDENCY=`echo $OUT | awk '{print $4}'` + job_cmd="${job_cmd} ${job_file}" + echo -n "Running command: ${job_cmd} " + out="$(${job_cmd})" + echo "Result: ${out}" + dependency=$(echo "${out}" | awk '{print $4}') + done + ``` + + The output looks like: + ```console + marie@login$ sh submit_scaling.sh + Running command: sbatch --ntasks=1 myjob.sh Result: Submitted batch job 2963822 + Running command: sbatch --ntasks=2 --dependency=afterany:2963822 myjob.sh Result: Submitted batch job 2963823 + Running command: sbatch --ntasks=4 --dependency=afterany:2963823 myjob.sh Result: Submitted batch job 2963824 + Running command: sbatch --ntasks=8 --dependency=afterany:2963824 myjob.sh Result: Submitted batch job 2963825 + ``` + +??? 
example "Example to submit job chain via script" + + This script submits three different job files, which will be executed one after each other. Of + course, the dependency reasons can be adapted. + + ```bash title="submit_job_chain.sh" + #!/bin/bash + + declare -a job_names=("jobfile_a.sh" "jobfile_b.sh" "jobfile_c.sh") + dependency="" + arraylength=${#job_names[@]} + + for (( i=0; i<arraylength; i++ )) ; do + job_nr=$((i + 1)) + echo "Job ${job_nr}/${arraylength}: ${job_names[$i]}" + if [ -n "${dependency}" ] ; then + echo "Dependency: after job ${dependency}" + dependency="--dependency=afterany:${dependency}" + fi + job="sbatch ${dependency} ${job_names[$i]}" + out=$(${job}) + dependency=$(echo "${out}" | awk '{print $4}') done ``` + The output looks like: + ```console + marie@login$ sh submit_job_chain.sh + Job 1/3: jobfile_a.sh + Job 2/3: jobfile_b.sh + Dependency: after job 2963708 + Job 3/3: jobfile_c.sh + Dependency: after job 2963709 + ``` + ## Array-Job with Afterok-Dependency and Datamover Usage In this example scenario, imagine you need to move data, before starting the main job. diff --git a/doc.zih.tu-dresden.de/docs/software/custom_easy_build_environment.md b/doc.zih.tu-dresden.de/docs/software/custom_easy_build_environment.md index 9232e7472e8acc0254f876352310be0355d9aa4e..15063e28c0d378c0a64c3f4bf86cd85190e2605c 100644 --- a/doc.zih.tu-dresden.de/docs/software/custom_easy_build_environment.md +++ b/doc.zih.tu-dresden.de/docs/software/custom_easy_build_environment.md @@ -1,4 +1,4 @@ -# Software Install with EasyBuild +# Software Installation with EasyBuild Sometimes the [modules](modules.md) installed in the cluster are not enough for your purposes and you need some other software or a different version of a software. 
diff --git a/doc.zih.tu-dresden.de/docs/software/energy_measurement.md b/doc.zih.tu-dresden.de/docs/software/energy_measurement.md index ac73235a27fefc8ea6dffb934b4439391a32cfff..633d3e406d3b770d0f55220b671bab9b4b4a2bb1 100644 --- a/doc.zih.tu-dresden.de/docs/software/energy_measurement.md +++ b/doc.zih.tu-dresden.de/docs/software/energy_measurement.md @@ -1,4 +1,4 @@ -# Energy Measurement Infrastructure +# Measure Energy Consumption The Intel Haswell nodes of ZIH system are equipped with power instrumentation that allow the recording and accounting of power dissipation and energy consumption data. The data is made @@ -58,7 +58,7 @@ the node under test to start, stop, and query the measurement device. !!! note - Please always execute `clearHdeem` before `startHdeem`. + Please always execute `clearHdeem` before `startHdeem`. ## Integration in Application Performance Traces diff --git a/doc.zih.tu-dresden.de/docs/software/gpu_programming.md b/doc.zih.tu-dresden.de/docs/software/gpu_programming.md index af027d612321b45c16b5096b556ff6ed244845c3..2e5b57422a0472650a6fc64c5c4bfeac433e5801 100644 --- a/doc.zih.tu-dresden.de/docs/software/gpu_programming.md +++ b/doc.zih.tu-dresden.de/docs/software/gpu_programming.md @@ -200,7 +200,15 @@ detail in [nvcc documentation](https://docs.nvidia.com/cuda/cuda-compiler-driver This compiler is available via several `CUDA` packages, a default version can be loaded via `module load CUDA`. Additionally, the `NVHPC` modules provide CUDA tools as well. -#### Usage of the CUDA compiler +For using CUDA with OpenMPI at multiple nodes, the OpenMPI module loaded must have been compiled with +CUDA support. If you aren't sure if the module you are using has support for it, you can check it as +follows: + +```console +ompi_info --parsable --all | grep mpi_built_with_cuda_support:value | awk -F":" '{print "OpenMPI supports CUDA:",$7}' +``` + +#### Usage of the CUDA Compiler The simple invocation `nvcc <code.cu>` will compile a valid CUDA program. 
`nvcc` differentiates between the device and the host code, which will be compiled in separate phases. Therefore, compiler diff --git a/doc.zih.tu-dresden.de/docs/software/lo2s.md b/doc.zih.tu-dresden.de/docs/software/lo2s.md index cf34feccfca15e1e37d5278f30117aaba827e800..1183e83022396fb76b2a8c455a1c870022a06588 100644 --- a/doc.zih.tu-dresden.de/docs/software/lo2s.md +++ b/doc.zih.tu-dresden.de/docs/software/lo2s.md @@ -1,7 +1,7 @@ -# lo2s - Lightweight Node-Level Performance Monitoring +# Record Course of Events with lo2s -`lo2s` creates parallel OTF2 traces with a focus on both application and system view. -The traces can contain any of the following information: +Lightweight node-level performance monitoring tool `lo2s` creates parallel OTF2 traces with a focus +on both application and system view. The traces can contain any of the following information: * From running threads * Calling context samples based on instruction overflows diff --git a/doc.zih.tu-dresden.de/docs/software/mathematics.md b/doc.zih.tu-dresden.de/docs/software/mathematics.md index d28c6eae651e4a9d7d9b6190c4768ff16e1e4cff..8562135e253ecb578ffb6b264d6a505965a9252a 100644 --- a/doc.zih.tu-dresden.de/docs/software/mathematics.md +++ b/doc.zih.tu-dresden.de/docs/software/mathematics.md @@ -562,3 +562,7 @@ To learn more about the MATLAB Parallel Computing Toolbox, check out these resou Tutorials](http://www.mathworks.com/products/parallel-computing/tutorials.html) * [Parallel Computing Videos](http://www.mathworks.com/products/parallel-computing/videos.html) * [Parallel Computing Webinars](http://www.mathworks.com/products/parallel-computing/webinars.html) +* [MATLAB NHR Tutorial Slides: Parallel Computing with MATLAB](https://event.zih.tu-dresden.de/nhr/matlab/module1/materials) +* [MATLAB NHR Tutorial Slides: Machine Learning with MATLAB](https://event.zih.tu-dresden.de/nhr/matlab/module2/materials) +* [MATLAB NHR Tutorial Slides: Deep Learning with 
MATLAB](https://event.zih.tu-dresden.de/nhr/matlab/module3/materials) +* [MATLAB NHR Tutorial Slides: Interoperability of MATLAB and Python](https://event.zih.tu-dresden.de/nhr/matlab/module4/materials) diff --git a/doc.zih.tu-dresden.de/docs/software/misc/must-error-01.png b/doc.zih.tu-dresden.de/docs/software/misc/must-error-01.png new file mode 100644 index 0000000000000000000000000000000000000000..d3f6fe02a9744724bd2084b75a5b8415eb41342c Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/must-error-01.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/must-error-02.png b/doc.zih.tu-dresden.de/docs/software/misc/must-error-02.png new file mode 100644 index 0000000000000000000000000000000000000000..fc91e2a5d4f81908474a7f60e2c457861a9ed311 Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/must-error-02.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/pika_cpu_idle.png b/doc.zih.tu-dresden.de/docs/software/misc/pika_cpu_idle.png new file mode 100644 index 0000000000000000000000000000000000000000..a9e499af841d2f01f33f94757955285700d04dae Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/pika_cpu_idle.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/pika_footprint.png b/doc.zih.tu-dresden.de/docs/software/misc/pika_footprint.png new file mode 100644 index 0000000000000000000000000000000000000000..ef98039c2aaa6ae6e2e9291be3f2b05e5039ef78 Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/pika_footprint.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/pika_io_block.png b/doc.zih.tu-dresden.de/docs/software/misc/pika_io_block.png new file mode 100644 index 0000000000000000000000000000000000000000..5b3cb64c577ed533e8d3e01cb1fe247037ea53ca Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/pika_io_block.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/pika_mem_leak.png 
b/doc.zih.tu-dresden.de/docs/software/misc/pika_mem_leak.png new file mode 100644 index 0000000000000000000000000000000000000000..dea70e4cdbd580ab609ca160389adbe4d635a6f0 Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/pika_mem_leak.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/pika_smt_2.png b/doc.zih.tu-dresden.de/docs/software/misc/pika_smt_2.png new file mode 100644 index 0000000000000000000000000000000000000000..71306f4aff11c85a540093808284be733cea3c24 Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/pika_smt_2.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/pika_timelines.png b/doc.zih.tu-dresden.de/docs/software/misc/pika_timelines.png new file mode 100644 index 0000000000000000000000000000000000000000..3b4bf2c451796809a80a16b7773fcdfc6ea9d651 Binary files /dev/null and b/doc.zih.tu-dresden.de/docs/software/misc/pika_timelines.png differ diff --git a/doc.zih.tu-dresden.de/docs/software/misc/spec_gnu-taurus.cfg b/doc.zih.tu-dresden.de/docs/software/misc/spec_gnu-taurus.cfg new file mode 100644 index 0000000000000000000000000000000000000000..42879eeffe242ddf2983809fec61713f86eafbc8 --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/software/misc/spec_gnu-taurus.cfg @@ -0,0 +1,206 @@ +####################################################################### +# Example configuration file for the GNU Compilers +# +# Defines: "model" => "mpi", "acc", "omp", "tgt", "tgtgpu" default "mpi" +# "label" => ext base label, default "nv" +# +# MPI-only Command: +# runhpc -c Example_gnu --reportable -T base --define model=mpi --ranks=40 small +# +# OpenACC Command: +# runhpc -c Example_gnu --reportable -T base --define model=acc --ranks=4 small +# +# OpenMP Command: +# runhpc -c Example_gnu --reportable -T base --define model=omp --ranks=1 --threads=40 small +# +# OpenMP Target Offload to Host Command: +# runhpc -c Example_gnu --reportable -T base --define model=tgt --ranks=1 --threads=40 small +# +# 
OpenMP Target Offload to NVIDIA GPU Command: +# runhpc -c Example_gnu --reportable -T base --define model=tgtnv --ranks=4 small +# +####################################################################### + +%ifndef %{label} # IF label is not set use gnu +% define label gnu +%endif + +%ifndef %{model} # IF model is not set use mpi +% define model mpi +%endif + +teeout = yes +makeflags=-j 24 + +# Tester Information +license_num = 37A +tester = Technische Universitaet Dresden +test_sponsor = Technische Universitaet Dresden + +###################################################### +# SUT Section +###################################################### +#include: Example_SUT.inc +include: sut-taurus.inc + +#[Software] +sw_compiler000 = C/C++/Fortran: Version 8.2.0 of +sw_compiler001 = GNU Compilers +sw_mpi_library = OpenMPI Version 3.1.3 +sw_mpi_other = None +sw_other = None + +#[General notes] +notes_000 = MPI startup command: +notes_005 = slurm srun command was used to start MPI jobs. + +####################################################################### +# End of SUT section +####################################################################### + +####################################################################### +# The header section of the config file. 
Must appear +# before any instances of "section markers" (see below) +# +# ext = how the binaries you generated will be identified +# tune = specify "base" or "peak" or "all" +%ifndef %{tudprof} +label = %{label}_%{model} +%else +label = %{label}_%{model}_%{tudprof} +%endif + +tune = base +output_format = text +use_submit_for_speed = 1 + +# Compiler Settings +default: +CC = mpicc +CXX = mpicxx +FC = mpif90 +%if %{tudprof} eq 'scorep' +CC = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpicc +CXX = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpicxx +FC = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpif90 +%endif + + +# Compiler Version Flags +CC_VERSION_OPTION = --version +CXX_VERSION_OPTION = --version +FC_VERSION_OPTION = --version + +# enable non-official patches to this kit +#strict_rundir_verify = 0 + +# MPI options and binding environment, dependent upon Model being run +# Adjust to match your system + +# OpenMP (CPU) Settings +%if %{model} eq 'omp' +preENV_OMP_PROC_BIND=true +preENV_OMP_PLACES=cores +%endif + +#OpenMP Targeting Host Settings +%if %{model} eq 'tgt' +preENV_OMP_PROC_BIND=true +preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 +%endif + +#MPIRUN_OPTS = --bind-to none -q +MPIRUN_OPTS= +submit = timeout 2h srun ${MPIRUN_OPTS} -n $ranks -c $threads $command + +# MPI Workaround for mca issues in sph_exa +#preOMPI_MCA_topo=basic + +# Score-P performance profiling +%if %{tudprof} eq 'scorep' + +## only record calls to main and MPI functions (runtime filtering) +## runtime filtering was replaced by compile-time filtering (see above) +# preENV_SCOREP_FILTERING_FILE=/home/brunst/ws-hpc2020/kit91/scorep.filter + +## set buffer memory size for profile/trace +preENV_SCOREP_TOTAL_MEMORY=64MB + +## enable profile recording +preENV_SCOREP_ENABLE_PROFILING=true + +## set to 'true' to enable detailed trace file recording 
+preENV_SCOREP_ENABLE_TRACING=false + +## collect memory consumption per node +preENV_SCOREP_METRIC_RUSAGE=ru_maxrss + +## uncomment to record cycle counter for scheduling analysis +preENV_SCOREP_METRIC_PAPI=PAPI_TOT_CYC + +%endif + + +####################################################################### +# Optimization + +# Note that SPEC baseline rules require that all uses of a given compiler +# use the same flags in the same order. See the SPEChpc Run Rules +# for more details +# http://www.spec.org/hpc2021/Docs/runrules.html +# +# OPTIMIZE = flags applicable to all compilers +# FOPTIMIZE = flags appliable to the Fortran compiler +# COPTIMIZE = flags appliable to the C compiler +# CXXOPTIMIZE = flags appliable to the C++ compiler +# +# See your compiler manual for information on the flags available +# for your compiler + +# Compiler flags applied to all models +default=base=default: +COPTIMIZE = -Ofast -march=native -lm # use -mcpu=native for ARM +CXXOPTIMIZE = -Ofast -march=native -std=c++14 +FOPTIMIZE = -Ofast -march=native -fno-stack-protector +FPORTABILITY = -ffree-line-length-none + +%if %{model} eq 'mpi' + pmodel=MPI +%endif + +# OpenACC flags +%if %{model} eq 'acc' + pmodel=ACC + OPTIMIZE += -fopenacc -foffload=-lm +%endif + +# OpenMP (CPU) flags +%if %{model} eq 'omp' + pmodel=OMP + OPTIMIZE += -fopenmp +%endif + +# OpenMP Targeting host flags +%if %{model} eq 'tgt' + pmodel=TGT + OPTIMIZE += -fopenmp +%endif + +# OpenMP Targeting Nvidia GPU flags +%if %{model} eq 'tgtnv' + pmodel=TGT + OPTIMIZE += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda +%endif + +# No peak flags set, so make peak use the same flags as base +default=peak=default: +basepeak=1 + +####################################################################### +# Portability +####################################################################### + + +519.clvleaf_t,619.clvleaf_s,719.clvleaf_m,819.clvleaf_l=default=default: +# Not needed anymore? 
+#PORTABILITY += -DSPEC_GNU_FLUSH diff --git a/doc.zih.tu-dresden.de/docs/software/misc/spec_nvhpc-alpha.cfg b/doc.zih.tu-dresden.de/docs/software/misc/spec_nvhpc-alpha.cfg new file mode 100644 index 0000000000000000000000000000000000000000..18743ba58e98e299227fc0273cf301b52330ed4c --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/software/misc/spec_nvhpc-alpha.cfg @@ -0,0 +1,287 @@ +# Invocation command line: +# runhpc --config nvhpc_alpha.cfg --ranks 8 --rebuild --define pmodel=acc --noreportable --tune=base --iterations=1 small +####################################################################### +# Example configuration file for the GNU Compilers +# +# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtgpu" default "mpi" +# "label" => ext base label, default "nv" +# +# MPI-only Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=mpi --ranks=40 small +# +# OpenACC Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=acc --ranks=4 small +# +# OpenMP Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small +# +# OpenMP Target Offload to Host Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small +# +# OpenMP Target Offload to NVIDIA GPU Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=tgtnv --ranks=4 small +# +####################################################################### + +%ifndef %{label} # IF label is not set use gnu +% define label nv +%endif + +%ifndef %{pmodel} # IF pmodel is not set use mpi +% define pmodel mpi +%endif + +teeout = yes +makeflags=-j 40 + +# Tester Information +license_num = 37A +test_sponsor = TU Dresden +tester = TU Dresden + + +####################################################################### +# SUT Section +####################################################################### + +# General SUT info +system_vendor = AMD +system_name = Alpha Centauri: AMD EPYC 7352 (AMD 
x86_64, NVIDIA A100-SXM4-40GB) +hw_avail = Jan-2019 +sw_avail = Aug-2022 + +#[Node_Description: Hardware] +node_compute_syslbl = AMD Rome +node_compute_order = 1 +node_compute_count = 34 +node_compute_purpose = compute +node_compute_hw_vendor = AMD +node_compute_hw_model = AMD K17 (Zen2) +node_compute_hw_cpu_name = AMD EPYC 7352 +node_compute_hw_ncpuorder = 2 chips +node_compute_hw_nchips = 2 +node_compute_hw_ncores = 96 +node_compute_hw_ncoresperchip = 48 +node_compute_hw_nthreadspercore = 2 +node_compute_hw_cpu_char = Up to 2.3 GHz +node_compute_hw_cpu_mhz = 2100 +node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core +node_compute_hw_scache = 512 KB I+D on chip per core +node_compute_hw_tcache000= 16384 KB I+D on chip per chip +node_compute_hw_ocache = None +node_compute_hw_memory = 1 TB +node_compute_hw_disk000= 3.5 TB +node_compute_hw_disk001 = NVMe SSD Controller SM981/PM981/PM983 +node_compute_hw_adapter_ib_model = Mellanox ConnectX-6 +node_compute_hw_adapter_ib_interconnect = EDR InfiniBand +node_compute_hw_adapter_ib_firmware = 20.28.2006 +node_compute_hw_adapter_ib_driver = mlx5_core +node_compute_hw_adapter_ib_data_rate = 200 Gb/s +node_compute_hw_adapter_ib_count = 2 +node_compute_hw_adapter_ib_slot_type = PCIe +node_compute_hw_adapter_ib_ports_used = 2 +node_compute_hw_other = None + +#[Node_Description: Accelerator] +node_compute_hw_accel_model = Tesla A100-SXM4-40GB +node_compute_hw_accel_count = 8 +node_compute_hw_accel_vendor = NVIDIA Corporation +node_compute_sw_accel_driver = NVIDIA CUDA 470.57.02 +node_compute_hw_accel_type = GPU +node_compute_hw_accel_connect = ASPEED Technology, Inc. 
(rev 04) +node_compute_hw_accel_ecc = Yes +node_compute_hw_accel_desc = none + +#[Node_Description: Software] +node_compute_sw_os000 = CentOS Linux +node_compute_sw_os001 = 7 +node_compute_sw_localfile = xfs +node_compute_sw_sharedfile000 = 4 PB Lustre parallel filesystem +node_compute_sw_sharedfile001 = over 4X EDR InfiniBand +node_compute_sw_state = Multi-user +node_compute_sw_other = None + +#[Fileserver] + +#[Interconnect] +interconnect_ib_syslbl = Mellanox InfiniBand +interconnect_ib_purpose = MPI Traffic and GPFS access +interconnect_ib_order = 1 +interconnect_ib_hw_vendor = Mellanox +interconnect_ib_hw_topo = Non-blocking Fat-tree +#interconnect_ib_hw_switch_ib_count = 2 +#interconnect_ib_hw_switch_ib_ports = 2 +#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s +#interconnect_ib_hw_switch_ib_model = Mellanox Switch IB-2 + +#[Software] +sw_compiler000 = C/C++/Fortran: Version 21.7 of the +sw_compiler001 = NVHPC toolkit +sw_mpi_library = Open MPI Version 4.1.1 +sw_mpi_other = None +system_class = Homogenous Cluster +sw_other = CUDA Driver Version: 11.4.2 + +####################################################################### +# End of SUT section +####################################################################### + + +####################################################################### +# The header section of the config file. 
Must appear +# before any instances of "section markers" (see below) +# +# ext = how the binaries you generated will be identified +# tune = specify "base" or "peak" or "all" +label = %{label}_%{pmodel} +tune = base +output_format = text +use_submit_for_speed = 1 + +# Compiler Settings +default: +CC = mpicc +CXX = mpicxx +FC = mpif90 +# Compiler Version Flags +CC_VERSION_OPTION = --version +CXX_VERSION_OPTION = --version +FC_VERSION_OPTION = --version + +# MPI options and binding environment, dependent upon Model being run +# Adjust to match your system + +# OpenMP (CPU) Settings +%if %{pmodel} eq 'omp' +preENV_OMP_PLACES=cores +#preENV_OMP_PROC_BIND=true +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,2\ +#3,24 +%endif + +#OpenMP Targeting Host Settings +%if %{pmodel} eq 'tgt' +#preENV_OMP_PROC_BIND=true +preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +preEnv_MPICH_GPU_SUPPORT_ENABLED=1 +preEnv_MPICH_SMP_SINGLE_COPY_MODE=CMA +preEnv_MPICH_GPU_EAGER_DEVICE_MEM=0 +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24 +%endif + +%ifdef %{ucx} +# if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI +# without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs +preENV_UCX_MEMTYPE_CACHE=n +preENV_UCX_TLS=self,shm,cuda_copy +%endif + +#MPIRUN_OPTS = --bind-to none -q +#submit = mpirun ${MPIRUN_OPTS} -n $ranks $command +submit = srun $command + +####################################################################### +# Optimization + +# Note that SPEC baseline rules require that all uses of a given compiler +# use the same flags in the same order. 
See the SPEChpc Run Rules +# for more details +# http://www.spec.org/hpc2021/Docs/runrules.html +# +# OPTIMIZE = flags applicable to all compilers +# FOPTIMIZE = flags appliable to the Fortran compiler +# COPTIMIZE = flags appliable to the C compiler +# CXXOPTIMIZE = flags appliable to the C++ compiler +# +# See your compiler manual for information on the flags available +# for your compiler + +# Compiler flags applied to all models +default=base=default: +#OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast +OPTIMIZE = -w -O3 -Mfprelaxed -Mnouniform -Mstack_arrays +COPTIMIZE = -lm # use -mcpu=native for ARM +CXXOPTIMIZE = -std=c++11 +CXXPORTABILITY = --c++17 + +#ARM +%if %{armOn} eq 'arm' + COPTIMIZE += -mcpu=native + #OPTIMIZE += -mcpu=a64fx +%endif + +# SVE +%if %{sveOn} eq 'sve' + COPTIMIZE += -march=armv8-a+sve +%endif + +%if %{model} eq 'mpi' + pmodel=MPI +%endif + +# OpenACC flags +%if %{pmodel} eq 'acc' + pmodel=ACC + # Use with PGI compiler only + # https://docs.nvidia.com/hpc-sdk/archive/21.7/ + #OPTIMIZE += -acc=gpu + OPTIMIZE += -acc -ta=tesla -tp=zen #-Minfo=accel #-DSPEC_ACCEL_AWARE_MPI->hangs it forever + +# 513.soma_t: +# PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE +%endif + +# OpenMP (CPU) flags +%if %{pmodel} eq 'omp' + pmodel=OMP + #OPTIMIZE += -qsmp=omp + OPTIMIZE += -fopenmp + #FOPTIMIZE += +%endif + +# OpenMP Targeting host flags +%if %{pmodel} eq 'tgt' + pmodel=TGT + # PGI + OPTIMIZE += -mp -acc=multicore + # Intel?? 
+ # OPTIMIZE += -qsmp=omp -qoffload + # -fopen-simd + # GCC (doesn't recognize its own flags) + #OPTIMIZE += -fopenmp + #OPTIMIZE += -fopenmp -mgomp + #OPTIMIZE += -fopenmp -msoft-stack -muniform-simt + #FOPTIMIZE += -homp +%endif + +# OpenMP Targeting host flags +%if %{pmodel} eq 'tgtnv' + pmodel=TGT + # PGI + OPTIMIZE += -mp=gpu -acc + #FOPTIMIZE += -homp + + # Note that NVHPC is in the process of adding OpenMP array + # reduction support so this option may be removed in future + 513.soma_t: + PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE +%endif + +# No peak flags set, so make peak use the same flags as base +default=peak=default: +basepeak=1 + +####################################################################### +# Portability +####################################################################### + + + +# The following section was added automatically, and contains settings that +# did not appear in the original configuration file, but were added to the +# raw file after the run. +default: +flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml +interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2 diff --git a/doc.zih.tu-dresden.de/docs/software/misc/spec_nvhpc-ppc.cfg b/doc.zih.tu-dresden.de/docs/software/misc/spec_nvhpc-ppc.cfg new file mode 100644 index 0000000000000000000000000000000000000000..06b9e1b85549892df1880e9ae2c461276ac95a2d --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/software/misc/spec_nvhpc-ppc.cfg @@ -0,0 +1,322 @@ +# Invocation command line: +# runhpc --config nvhpc_ppc --define pmodel=acc --action run --nobuild --ranks=6 --reportable tiny +# output_root was not used for this run +####################################################################### +# Example configuration file for the GNU Compilers +# +# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtnv" default "mpi" +# "label" => ext base label, default "nv" +# +# MPI-only Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=mpi --ranks=40 
small +# +# OpenACC Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=acc --ranks=4 small +# +# OpenMP Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small +# +# OpenMP Target Offload to Host Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small +# +# OpenMP Target Offload to NVIDIA GPU Command: +# runhpc -c Example_gnu --reportable -T base --define pmodel=tgtnv --ranks=4 small +# +####################################################################### + +%ifndef %{label} # IF label is not set use default "nv" +% define label nv +%endif + +%ifndef %{pmodel} # IF pmodel is not set use default mpi +% define pmodel mpi +%endif + +teeout = yes +makeflags=-j 40 + +# Tester Information +license_num = 37A +test_sponsor = TU Dresden +tester = TU Dresden + +####################################################################### +# SUT Section +####################################################################### +#include: Example_SUT.inc +# ----- Begin inclusion of 'Example_SUT.inc' +####################################################################### +# General SUT info +system_vendor = IBM +system_name = Taurus: IBM Power System AC922 (IBM Power9, Tesla V100-SXM2-32GB) +node_compute_sw_accel_driver = NVIDIA CUDA 440.64.00 +node_compute_hw_adapter_ib_slot_type = None +node_compute_hw_adapter_ib_ports_used = 2 +node_compute_hw_adapter_ib_model = Mellanox ConnectX-5 +node_compute_hw_adapter_ib_interconnect = EDR InfiniBand +node_compute_hw_adapter_ib_firmware = 16.27.6008 +node_compute_hw_adapter_ib_driver = mlx5_core +node_compute_hw_adapter_ib_data_rate = 100 Gb/s (4X EDR) +node_compute_hw_adapter_ib_count = 2 +interconnect_ib_syslbl = Mellanox InfiniBand +interconnect_ib_purpose = MPI Traffic and GPFS access +interconnect_ib_order = 1 +#interconnect_ib_hw_vendor = Mellanox +#interconnect_ib_hw_topo = Non-blocking Fat-tree 
+#interconnect_ib_hw_switch_ib_ports = 36 +#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s +#interconnect_ib_hw_switch_ib_count = 1 +#interconnect_ib_hw_model = Mellanox Switch IB-2 +hw_avail = Nov-2018 +sw_avail = Nov-2021 +prepared_by = Noah Trumpik (Noah.Trumpik@tu-dresden.de) + +#[Node_Description: Hardware] +node_compute_syslbl = IBM Power System AC922 +node_compute_order = 1 +node_compute_count = 30 +node_compute_purpose = compute +node_compute_hw_vendor = IBM +node_compute_hw_model = IBM Power System AC922 +node_compute_hw_cpu_name = IBM POWER9 2.2 (pvr 004e 1202) +node_compute_hw_ncpuorder = 2 chips +node_compute_hw_nchips = 2 +node_compute_hw_ncores = 44 +node_compute_hw_ncoresperchip = 22 +node_compute_hw_nthreadspercore = 4 +node_compute_hw_cpu_char = Up to 3.8 GHz +node_compute_hw_cpu_mhz = 2300 +node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core +node_compute_hw_scache = 512 KB I+D on chip per core +node_compute_hw_tcache000= 10240 KB I+D on chip per chip +node_compute_hw_ocache = None +node_compute_hw_memory = 256 GB (16 x 16 GB RDIMM-DDR4-2666) +node_compute_hw_disk000= 2 x 1 TB (ATA Rev BE35) +node_compute_hw_disk001 = NVMe SSD Controller 172Xa/172Xb +node_compute_hw_other = None + +#[Node_Description: Accelerator] +node_compute_hw_accel_model = Tesla V100-SXM2-32GB +node_compute_hw_accel_count = 6 +node_compute_hw_accel_vendor= NVIDIA Corporation +node_compute_hw_accel_type = GPU +node_compute_hw_accel_connect = NVLINK +node_compute_hw_accel_ecc = Yes +node_compute_hw_accel_desc = See Notes + +#[Node_Description: Software] +node_compute_sw_os000 = Red Hat Enterprise Linux +node_compute_sw_os001 = 7.6 +node_compute_sw_localfile = xfs +node_compute_sw_sharedfile = 4 PB Lustre parallel filesystem +node_compute_sw_state = Multi-user +node_compute_sw_other = None + +#[Fileserver] + +#[Interconnect] + +#[Software] +sw_compiler000 = C/C++/Fortran: Version 21.5 of the +sw_compiler001 = NVHPC toolkit +sw_mpi_library = Open MPI Version 4.1.2 
+sw_mpi_other = None +system_class = Homogenous Cluster +sw_other = None + +#[General notes] +notes_000 = MPI startup command: +notes_005 = srun command was used to launch job using 1 GPU/rank. +notes_010 =Detailed information from nvaccelinfo +notes_015 = +notes_020 =CUDA Driver Version: 11000 +notes_025 =NVRM version: NVIDIA UNIX ppc64le Kernel Module 440.64.00 Wed Feb 26 16:01:28 UTC 2020 +notes_030 = +notes_035 =Device Number: 0 +notes_040 =Device Name: Tesla V100-SXM2-32GB +notes_045 =Device Revision Number: 7.0 +notes_050 =Global Memory Size: 33822867456 +notes_055 =Number of Multiprocessors: 80 +notes_060 =Concurrent Copy and Execution: Yes +notes_065 =Total Constant Memory: 65536 +notes_070 =Total Shared Memory per Block: 49152 +notes_075 =Registers per Block: 65536 +notes_080 =Warp Size: 32 +notes_085 =Maximum Threads per Block: 1024 +notes_090 =Maximum Block Dimensions: 1024, 1024, 64 +notes_095 =Maximum Grid Dimensions: 2147483647 x 65535 x 65535 +notes_100 =Maximum Memory Pitch: 2147483647B +notes_105 =Texture Alignment: 512B +notes_110 =Max Clock Rate: 1530 MHz +notes_115 =Execution Timeout: No +notes_120 =Integrated Device: No +notes_125 =Can Map Host Memory: Yes +notes_130 =Compute Mode: default +notes_135 =Concurrent Kernels: Yes +notes_140 =ECC Enabled: Yes +notes_145 =Memory Clock Rate: 877 MHz +notes_150 =Memory Bus Width: 4096 bits +notes_155 =L2 Cache Size: 6291456 bytes +notes_160 =Max Threads Per SMP: 2048 +notes_165 =Async Engines: 4 +notes_170 =Unified Addressing: Yes +notes_175 =Managed Memory: Yes +notes_180 =Concurrent Managed Memory: Yes +notes_185 =Preemption Supported: Yes +notes_190 =Cooperative Launch: Yes +notes_195 = Multi-Device: Yes +notes_200 =Default Target: cc70 +notes_205 = + +####################################################################### +# End of SUT section +####################################################################### + +####################################################################### +# The 
header section of the config file. Must appear +# before any instances of "section markers" (see below) +# +# ext = how the binaries you generated will be identified +# tune = specify "base" or "peak" or "all" +label = %{label}_%{pmodel} +tune = base +output_format = text +use_submit_for_speed = 1 + +# Compiler Settings +default: +CC = mpicc +CXX = mpic++ +FC = mpif90 +# Compiler Version Flags +CC_VERSION_OPTION = --version +CXX_VERSION_OPTION = --version +FC_VERSION_OPTION = --version + +# MPI options and binding environment, dependent upon Model being run +# Adjust to match your system + +# OpenMP (CPU) Settings +%if %{pmodel} eq 'omp' +preENV_OMP_PLACES=cores +#preENV_OMP_PROC_BIND=true +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,2\ +#3,24 +%endif + +#OpenMP Targeting Host Settings +%if %{pmodel} eq 'tgt' +#preENV_OMP_PROC_BIND=true +preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +preEnv_MPICH_GPU_SUPPORT_ENABLED=1 +preEnv_MPICH_SMP_SINGLE_COPY_MODE=CMA +preEnv_MPICH_GPU_EAGER_DEVICE_MEM=0 +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 +#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24 +%endif + +%ifdef %{ucx} +# if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI +# without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs +preENV_UCX_MEMTYPE_CACHE=n +preENV_UCX_TLS=self,shm,cuda_copy +%endif + +#MPIRUN_OPTS = --bind-to none -q +# 1 GPU per rs, 7 cores per RS, 1 MPI task per RS, 6 RS per host +submit = srun ${MPIRUN_OPTS} $command + +####################################################################### +# Optimization + +# Note that SPEC baseline rules require that all uses of a given compiler +# use the same flags in the 
same order. See the SPEChpc Run Rules +# for more details +# http://www.spec.org/hpc2021/Docs/runrules.html +# +# OPTIMIZE = flags applicable to all compilers +# FOPTIMIZE = flags appliable to the Fortran compiler +# COPTIMIZE = flags appliable to the C compiler +# CXXOPTIMIZE = flags appliable to the C++ compiler +# +# See your compiler manual for information on the flags available +# for your compiler + +# Compiler flags applied to all models +default=base=default: +OPTIMIZE = -O3 +COPTIMIZE = -lm # use -mcpu=native for ARM +CXXOPTIMIZE = -std=c++11 +#FOPTIMIZE = -ffree-line-length-none -fno-stack-protector +FOPTIMIZE = + +%if %{model} eq 'mpi' + pmodel=MPI +%endif + +# OpenACC flags +%if %{pmodel} eq 'acc' + # Use with PGI compiler only + # https://docs.nvidia.com/hpc-sdk/archive/21.5/ + pmodel=ACC + #OPTIMIZE += -acc=gpu + OPTIMIZE += -acc -ta=tesla + OPTIMIZE += -acc -ta=tesla -DSPEC_ACCEL_AWARE_MPI #-Minfo=accel +%endif + +# Note that NVHPC is in the process of adding OpenMP array +# reduction support so this option may be removed in future +# reduction not supported on taurusml due to old driver +513.soma_t: +PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE +513.soma_s: +PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE + +# OpenMP (CPU) flags +%if %{pmodel} eq 'omp' + pmodel=OMP + #OPTIMIZE += -qsmp=omp + OPTIMIZE += -fopenmp + #FOPTIMIZE += +%endif + +# OpenMP Targeting host flags +%if %{pmodel} eq 'tgt' + pmodel=TGT + # PGI + OPTIMIZE += -mp -acc=multicore + # Intel?? 
+ # OPTIMIZE += -qsmp=omp -qoffload + # -fopen-simd + # GCC (doesn't recognize its own flags) + #OPTIMIZE += -fopenmp + #OPTIMIZE += -fopenmp -mgomp + #OPTIMIZE += -fopenmp -msoft-stack -muniform-simt + #FOPTIMIZE += -homp +%endif + +# OpenMP Targeting host flags +%if %{pmodel} eq 'tgtnv' + pmodel=TGT + # PGI + OPTIMIZE += -mp=gpu -acc + #FOPTIMIZE += -homp +%endif + +# No peak flags set, so make peak use the same flags as base +default=peak=default: +basepeak=1 + +####################################################################### +# Portability +####################################################################### + + + +# The following section was added automatically, and contains settings that +# did not appear in the original configuration file, but were added to the +# raw file after the run. +default: +flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml +interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2 diff --git a/doc.zih.tu-dresden.de/docs/software/mpi_usage_error_detection.md b/doc.zih.tu-dresden.de/docs/software/mpi_usage_error_detection.md index b604bf5398681458ac416336ea7c42a0b3a25b15..630e19e8fe6d7ca70a89175662ab8e79b9adceac 100644 --- a/doc.zih.tu-dresden.de/docs/software/mpi_usage_error_detection.md +++ b/doc.zih.tu-dresden.de/docs/software/mpi_usage_error_detection.md @@ -1,4 +1,4 @@ -# MPI Error Detection +# Check MPI Correctness with MUST MPI as the de-facto standard for parallel applications of the message passing paradigm offers more than one hundred different API calls with complex restrictions. As a result, developing @@ -6,51 +6,67 @@ applications with this interface is error prone and often time consuming. Some u may only manifest on some platforms or some application runs, which further complicates the detection of these errors. Thus, special debugging tools for MPI applications exist that automatically check whether an application conforms to the MPI standard and whether its MPI calls -are safe. 
At ZIH, we maintain and support MUST for this task, though different types of these tools -exist (see last section). +are safe. At ZIH, we maintain and support **MUST** for this task, though different types of these +tools exist (see last section). ## MUST -MUST checks if your application conforms to the MPI standard and will issue warnings if there are -errors or non-portable constructs. You can apply MUST without modifying your source code, though we -suggest to add the debugging flag "-g" during compilation. +[MUST](https://itc.rwth-aachen.de/must/) checks if your application conforms to the MPI +standard and will issue warnings if there are errors or non-portable constructs. You can apply MUST +without modifying your source code, though we suggest to add the debugging flag `-g` during +compilation. -See also [MUST Introduction Slides](misc/parallel_debugging_must.pdf). +See also [MUST Introduction Slides](misc/parallel_debugging_must.pdf) for a starting point. ### Setup and Modules You need to load a module file in order to use MUST. Each MUST installation uses a specific combination of a compiler and an MPI library, make sure to use a combination that fits your needs. -Right now we only provide a single combination on each system, contact us if you need further +Right now we provide two combinations, [contact us](../support/support.md) if you need further combinations. You can query for the available modules with: ```console marie@login$ module avail must - MUST/1.6.0-rc3-intel-2018a (L) + MUST/1.6.0-rc3-intel-2018a MUST/1.7.2-intel-2020a (D) ``` You can load a MUST module as follows: ```console marie@login$ module load MUST -Module MUST/1.6.0-rc3-intel-2018a and 16 dependencies loaded. +Module MUST/1.7.2-intel-2020a and 16 dependencies loaded. ``` Besides loading a MUST module, no further changes are needed during compilation and linking. 
### Running your Application with MUST -In order to run your application with MUST you need to replace the `srun` command with `mustrun`: +In order to launch your application with MUST you need to replace the `srun` command with +`mustrun --must:mpiexec srun --must:np --ntasks`: ```console -marie@login$ mustrun -np <number of MPI processes> ./<your binary> +marie@login$ mustrun --must:mpiexec srun --must:np --ntasks --ntasks <number of MPI processes> ./<your binary> ``` +Besides replacing the `srun` command you need to be aware that **MUST always allocates an extra +process**, i.e. if you issue a +`mustrun --must:mpiexec srun --must:np --ntasks --ntasks 4 ./<your binary>` then +MUST will start **5 processes** instead. This is usually not critical. However, in interactive and +batch jobs **make sure to allocate an extra CPU for this task**. + Suppose your application is called `fancy-program` and is normally run with 4 processes. -The invocation should then be +The MUST workflow should then be ```console -marie@login$ mustrun -np 4 ./fancy-program +marie@login$ module load MUST + +# Compile your application with the debugging flag "-g" on the correct architecture, e.g.: +marie@login$ srun --ntasks 1 --partition <partition> mpicc -g -o fancy-program fancy-program.c + +# Allocate interactive session with 1 extra process for MUST +marie@login$ salloc --ntasks 5 --partition <partition> + +marie@login$ mustrun --must:mpiexec srun --must:np --ntasks --must:stacktrace backward --ntasks 4 ./fancy-program [MUST] MUST configuration ... centralized checks with fall-back application crash handling (very slow) [MUST] Weaver ... success [MUST] Code generation ... success @@ -65,15 +81,23 @@ marie@login$ mustrun -np 4 ./fancy-program [MUST] Execution finished, inspect "/home/marie/MUST_Output.html"! ``` -Besides replacing the `srun` command you need to be aware that **MUST always allocates an extra -process**, i.e. 
if you issue a `mustrun -np 4 ./a.out` then MUST will start 5 processes instead. -This is usually not critical, however in batch jobs **make sure to allocate an extra CPU for this -task**. +??? hint "Twice `--ntasks`" + + You might wonder about the two `--ntasks` arguments in the above outlined `mustrun` command. + Mustrun is able to invoke another command instead of mpiexec. For ZIH systems, this will be + `srun` (`--must:mpiexec srun`). Now, you need to specify what argument of the MPI run arguments + holds the number of application processes. For Slurm, it is `--ntasks <N>`. Thus, you need to + specify `--must:np --ntasks --ntasks <N>`. + +With the additional flag `--must:stacktrace backward` you can produce an additional stacktrace +with line number of the error location which allows to pinpoint the error location in your code. +This might slow down code execution slightly. Finally, MUST assumes that your application may crash at any time. To still gather correctness results under this assumption is extremely expensive in terms of performance overheads. Thus, if your application does not crash, you should add `--must:nocrash` to the `mustrun` command to make MUST aware of this knowledge. Overhead is drastically reduced with this switch. +Further details on alternative launch modes are described in the MUST documentation. ### Result Files @@ -81,18 +105,147 @@ After running your application with MUST you will have its output in the working application. The output is named `MUST_Output.html`. Open this files in a browser to analyze the results. The HTML file is color coded: -- Entries in green represent notes and useful information -- Entries in yellow represent warnings -- Entries in red represent errors +- Entries in green represent notes and useful information +- Entries in yellow represent warnings +- Entries in red represent errors + +### Example Usage of MUST + +In this section, we provide a detailed example explaining the usage of MUST.
The example is taken +from the [MUST documentation v1.7.2](https://hpc.rwth-aachen.de/must/files/Documentation-1.7.2.pdf). + +??? example "example.c" + + This C programm contains three MPI usage errors. Save it as `example.c`. + + ``` + #include <stdio.h> + #include <mpi.h> + + int main (int argc , char ** argv) { + int rank , + size , + sBuf [ 2 ] = { 1 , 2 } , + rBuf [ 2 ] ; + MPI_Status status ; + MPI_Datatype newType ; + + MPI_Init(&argc ,&argv ) ; + MPI_Comm_rank (MPI_COMM_WORLD, &rank ) ; + MPI_Comm_size (MPI_COMM_WORLD, &size ) ; + + // Enough tasks? + if ( size < 2 ) { + printf("This test needs at least 2 processes ! \n"); + MPI_Finalize(); + return 1 ; + } + + // Say hello + printf("Hello, I am rank %d of %d processes. \n", rank , size); + + //) Create a datatype + MPI_Type_contiguous( 2, MPI_INT, &newType); + MPI_Type_commit(&newType); + + // 2) Use MPI Sendrecv to perform a ring communication + MPI_Sendrecv(sBuf, 1, newType, (rank+1)%size, 123, + rBuf, sizeof(int)*2, MPI_BYTE, (rank=1+size) %size, 123 , MPI_COMM_WORLD, &status ) ; + + // 3) Use MPI Send and MPI Recv to perform a ring communication + MPI_Send(sBuf, 1, newType, (rank+1)%size, 456, MPI_COMM_WORLD); + MPI_Recv(rBuf, sizeof(int)*2, MPI_BYTE, (rank=1+size)%size, 456, MPI_COMM_WORLD, &status); + + // Say bye bye + printf("Signing off, rank %d. \n" , rank); + + MPI_Finalize(); + return 0 ; + } + /*EOF*/ + ``` + +??? example "Compile and execute" + + The first step is to prepare the environment by loading a MUST module. + + ```console + marie@login$ module purge + marie@login$ module load MUST + Module MUST/1.7.2-intel-2020a and 16 dependencies loaded. + ``` + + Now, you compile the `example.c` program using the MPI compiler wrapper. The compiled binary is + called `example`. + + ```console + marie@login$ mpicc example.c -g -o example + ``` + + Finally, you execute the example application on the compute nodes. As you can see, the following + command line will submit a job to the batch system. 
+ + ``` + marie@login $ mustrun --must:mpiexec srun --must:np --ntasks --ntasks 4 --time 00:10:00 example + [MUST] MUST configuration ... centralized checks with fall-back application crash handling (very slow) + [MUST] Information: overwritting old intermediate data in directory "/scratch/ws/0/marie-must/must_temp"! + [MUST] Using prebuilt infrastructure at /sw/installed/MUST/1.7.2-intel-2020a/modules/mode1-layer2 + [MUST] Weaver ... success + [MUST] Generating P^nMPI configuration ... success + [MUST] Search for linked P^nMPI ... not found ... using LD_PRELOAD to load P^nMPI ... success + [MUST] Executing application: + srun: job 32765491 queued and waiting for resources + srun: job 32778008 has been allocated resources + Hello , I am rank 2 of 4 processes. + Hello , I am rank 3 of 4 processes. + Hello , I am rank 0 of 4 processes. + Hello , I am rank 1 of 4 processes. + ============MUST=============== + ERROR: MUST detected a deadlock, detailed information is available in the MUST output file. You should either investigate details with a debugger or abort, the operation of MUST will stop from now. + =============================== + ``` + +??? example "Analysis of MUST output files and MPI usage errors" + + MUST produces an `MUST_Output.html` file and a directory `MUST_Output-files` with additional + html files. Copy the files to your local host, e.g. + + ```console + marie@local$ scp -r taurus.hrsk.tu-dresden.de:/scratch/ws/0/marie-must/{MUST_Output-files,MUST_Output.html} + ``` + + and open the file `MUST_Output.html` using a webbrowser. Alternativly, you can open the html file with a + `firefox` instance on the HPC sytems. This requires to [forward the X11 support via SSH](../access/ssh_login.md#x11-forwarding). 
+ + MUST detects all three MPI usage errors within this example: + + * A type mismatch + * A send-send deadlock + * A leaked datatype + + The type mismatch is reported as follows: + +  + {: align="center" summary="Type mismatch error report from MUST."} + + MUST also offers a detailed page for the type mismatch error. + +  + {: summary="Retrieve job results via GUI using the Job Monitor." align="center"} + + In order not to exceed the scope of this example, we do not explain the MPI usage errors in more + details. Please, feel free to deep-dive into the error description provided in the official + [MUST documentation v1.7.2](https://hpc.rwth-aachen.de/must/files/Documentation-1.7.2.pdf) (Sec. + 4). ## Further MPI Correctness Tools Besides MUST, there exist further MPI correctness tools, these are: -- Marmot (predecessor of MUST) -- MPI checking library of the Intel Trace Collector -- ISP (From Utah) -- Umpire (predecessor of MUST) +- Marmot (predecessor of MUST) +- MPI checking library of the Intel Trace Collector +- ISP (From Utah) +- Umpire (predecessor of MUST) ISP provides a more thorough deadlock detection as it investigates alternative execution paths, however its overhead is drastically higher as a result. 
Contact our support if you have a specific diff --git a/doc.zih.tu-dresden.de/docs/software/papi.md b/doc.zih.tu-dresden.de/docs/software/papi.md index d8108bba3048da33661e0dd320a2807a0dd001aa..c5f0e7cfaf6260323a8fb572832e9f0a44f792a4 100644 --- a/doc.zih.tu-dresden.de/docs/software/papi.md +++ b/doc.zih.tu-dresden.de/docs/software/papi.md @@ -1,4 +1,4 @@ -# PAPI Library +# Read CPU Performance Counters with PAPI ## Introduction diff --git a/doc.zih.tu-dresden.de/docs/software/perf_tools.md b/doc.zih.tu-dresden.de/docs/software/perf_tools.md index 2db805a12f96e3daad253ea43e5030ad275cfb12..a32abadd94a358e3eb1f9a6c9e364313660f5b5a 100644 --- a/doc.zih.tu-dresden.de/docs/software/perf_tools.md +++ b/doc.zih.tu-dresden.de/docs/software/perf_tools.md @@ -1,4 +1,4 @@ -# Perf Tools +# Produce Performance Overview with Perf The Linux `perf` command provides support for sampling applications and reading performance counters. `perf` consists of two parts: the kernel space implementation and the userland tools. diff --git a/doc.zih.tu-dresden.de/docs/software/pika.md b/doc.zih.tu-dresden.de/docs/software/pika.md index 3b9cd3fd7ff821f3dc5d76241b46b2645b9fc01b..f84460f8056d8d010406dccc89a9270131cf87d5 100644 --- a/doc.zih.tu-dresden.de/docs/software/pika.md +++ b/doc.zih.tu-dresden.de/docs/software/pika.md @@ -1,4 +1,4 @@ -# PIKA +# Track Slurm Jobs with PIKA PIKA is a hardware performance monitoring stack to identify inefficient HPC jobs. Users of ZIH systems have the possibility to visualize and analyze the efficiency of their jobs via the @@ -6,9 +6,10 @@ systems have the possibility to visualize and analyze the efficiency of their jo !!! hint - To understand this small guide, it is recommended to open the + To understand this guide, it is recommended that you open the [web interface](https://selfservice.zih.tu-dresden.de/l/index.php/hpcportal/jobmonitoring/zih/jobs) - in a separate window. Furthermore, at least one real HPC job should have been submitted. 
+ in a separate window. Furthermore, you should have submitted at least one real HPC job at ZIH + systems. ## Overview @@ -20,11 +21,11 @@ for the visualization and analysis of job performance data. ## Table View and Job Search The analysis of HPC jobs in PIKA is designed as a top-down approach. Starting from the table view, -users can either analyze running or completed jobs. They can navigate from groups of jobs with the +you can either analyze running or completed jobs. You can navigate from groups of jobs with the same name to the metadata of an individual job and finally investigate the job’s runtime metrics in a timeline view. -To find jobs with specific properties, the table can be sorted by any column, e.g., by consumed CPU +To find jobs with specific properties, you can sort the table by any column, e.g., by consumed CPU hours to find jobs where an optimization has a large impact on the system utilization. Additionally, there is a filter mask to find jobs that match several properties. When a job has been selected, the timeline view opens. @@ -32,39 +33,63 @@ timeline view opens. ## Timeline Visualization PIKA provides timeline charts to visualize the resource utilization of a job over time. After a job -is completed, timeline charts can help to identify periods of inefficient resource usage. However, -they are also suitable for the live assessment of performance during the job’s runtime. In case of -unexpected performance behavior, users can cancel the job, thus avoiding long execution with subpar -performance. +is completed, timeline charts can help you to identify periods of inefficient resource usage. +However, they are also suitable for the live assessment of performance during the job’s runtime. In +case of unexpected performance behavior, you can cancel the job, thus avoiding long execution with +subpar performance. 
+ +The following timeline visualization shows a job with 840 cores, spread over 35 (dual-socket +Haswell) nodes that have been allocated for exclusive use. + + +{: align="center"} PIKA provides the following runtime metrics: -|Metric| Hardware Unit| -|---|---| -|CPU Usage|CPU core| -|IPC (instructions per cycle)|CPU core| -|FLOPS (normalized to single precision) |CPU core| -|Main Memory Bandwidth|CPU socket| -|CPU Power|CPU socket| -|Main Memory Utilization|node| -|I/O Bandwidth (local, Lustre) |node| -|I/O Metadata (local, Lustre) |node| -|GPU Usage|GPU device| -|GPU Memory Utilization|GPU device| -|GPU Power Consumption|GPU device| -|GPU Temperature|GPU device| +|Metric| Hardware Unit| Sampling Frequency| +|---|---|---:| +|CPU Usage|CPU core|30s| +|IPC (instructions per cycle)|CPU core|60s| +|FLOPS (normalized to single precision) |CPU core|60s| +|Main Memory Bandwidth|CPU socket|60s| +|CPU Power|CPU socket|60s| +|Main Memory Utilization|node|30s| +|I/O Bandwidth (local, Lustre) |node|30s| +|I/O Metadata (local, Lustre) |node|30s| +|GPU Usage|GPU device|30s| +|GPU Memory Utilization|GPU device|30s| +|GPU Power Consumption|GPU device|30s| +|GPU Temperature|GPU device|30s| Each monitored metric is represented by a timeline, whereby metrics with the same unit and data -source are displayed in a common chart, e.g., different Lustre metadata operations. Each metric is +source are displayed in a common chart, e.g., different Lustre metadata operations. Each metric is measured with a certain granularity concerning the hardware, e.g. per hardware thread, per CPU socket or per node. +Most metrics are recorded every 30 seconds except IPC, FLOPS, Main Memory Bandwidth and Power +Consumption. The latter are determined every 60 seconds, as they are a combination of different +hardware counters, which leads to a higher measurement overhead. 
Depending on the architecture, +metrics such as normalized FLOPS (2 x double-precision + 1 x single-precision) can require +multiplexing, since single and double precision FLOPS cannot be measured simultaneously. +The sampling frequency cannot be changed by the user. !!! hint Be aware that CPU socket or node metrics can share the resources of other jobs running on the same CPU socket or node. This can result e.g., in cache perturbation and thus a sub-optimal - performance. To get valid performance data for those metrics, it is recommended to submit an - exclusive job! + performance. To get valid performance data for those metrics, it is recommended to submit an + exclusive job (`--exclusive`)! + +If the current partition supports simultaneous multithreading (SMT) the maximum number of hardware +threads per physical core is displayed in the SMT column. The Slurm configuration on ZIH systems +disables SMT by default. Therefore, in the example below, only a maximum CPU usage of 0.5 can be +achieved, since PIKA combines two hardware threads per physical core. If you want to use SMT, you +must set the Slurm environment variable `SLURM_HINT=multithread`. In this case, `srun` distributes +the tasks to all available hardware threads, thus a CPU usage of 1 can be reached. However, the SMT +configuration only refers to the `srun` command. For single node jobs without `srun` command the +tasks are automatically distributed to all available hardware threads. + + +{: align="center"} !!! note @@ -73,7 +98,8 @@ socket or per node. performance data per physical core. The following table explains different timeline visualization modes. -By default, each timeline shows the average value over all hardware units (HUs) per measured interval. +By default, each timeline shows the average value over all hardware units (HUs) per measured +interval. |Visualization Mode| Description| |---|---| @@ -108,9 +134,12 @@ usually contains an unlimited number of values. 
A scatter plot enables the comb footprint metrics (except for job states and job tags), which is particularly useful for investigating their correlation. + +{: align="center"} + ## Hints -If users wish to perform their own measurement of performance counters using performance tools other +If you wish to perform your own measurement of performance counters using performance tools other than PIKA, it is recommended to disable PIKA monitoring. This can be done using the following Slurm flags in the job script: @@ -123,7 +152,24 @@ flags in the job script: ## Known Issues -The PIKA metric FLOPS is not supported by the Intel Haswell cpu architecture. +The PIKA metric FLOPS is not supported by the Intel Haswell CPU architecture. However, PIKA provides this metric to show the computational intensity. **Do not rely on FLOPS on Haswell!** We use the event `AVX_INSTS_CALC` which counts the `insertf128` instruction. + +## Case Studies + +### Idle CPUs + + +{: align="center"} + +### Blocking I/O Operations + + +{: align="center"} + +### Memory Leaks + + +{: align="center"} diff --git a/doc.zih.tu-dresden.de/docs/software/scorep.md b/doc.zih.tu-dresden.de/docs/software/scorep.md index 0e2dc6c2358c95f47373a2f046f3fe4d643ae643..8966df8ed16637333299805076f0282948b0c376 100644 --- a/doc.zih.tu-dresden.de/docs/software/scorep.md +++ b/doc.zih.tu-dresden.de/docs/software/scorep.md @@ -1,4 +1,4 @@ -# Score-P +# Record Course of Events with Score-P The Score-P measurement infrastructure is a highly scalable and easy-to-use tool suite for profiling, event tracing, and online analysis of HPC applications. 
Currently, it works with the diff --git a/doc.zih.tu-dresden.de/docs/software/singularity_power9.md b/doc.zih.tu-dresden.de/docs/software/singularity_power9.md index 5daf70465d006799bc3df921dcb4698a8d648eab..080314e52f349f94caf3a1e4ca018807797fd0fa 100644 --- a/doc.zih.tu-dresden.de/docs/software/singularity_power9.md +++ b/doc.zih.tu-dresden.de/docs/software/singularity_power9.md @@ -1,4 +1,4 @@ -# Singularity for Power 9 Architecture +# Singularity for Power9 Architecture !!! note "Root privileges" diff --git a/doc.zih.tu-dresden.de/docs/software/spec.md b/doc.zih.tu-dresden.de/docs/software/spec.md new file mode 100644 index 0000000000000000000000000000000000000000..f567f8de652668d963751bfb9d1efa8ae0461405 --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/software/spec.md @@ -0,0 +1,369 @@ +# Compare System Performance with SPEChpc + +SPEChpc 2021 is a benchmark suite developed by the Standard Performance Evaluation Corporation +(SPEC) for the evaluation of various, heterogeneous HPC systems. Documentation and released +benchmark results can be found on their [web page](https://www.spec.org/hpc2021/). In fact, our +system *Taurus* (partition `haswell`) is the benchmark's reference system and thus represents +the baseline score. + +The tool includes nine real-world scientific applications (see +[benchmark table](https://www.spec.org/hpc2021/docs/result-fields.html#benchmarks)) +with different workload sizes ranging from tiny, small, medium to large, and different +parallelization models including MPI only, MPI+OpenACC, MPI+OpenMP and MPI+OpenMP with target +offloading. With this benchmark suite you can compare the performance of different HPC systems and +furthermore, evaluate parallel strategies for applications on a target HPC system. When you e.g. 
+want to implement an algorithm, port an application to another platform or integrate acceleration +into your code, you can determine from which target system and parallelization model your +application performance could benefit most. Or this way you can check whether an acceleration scheme +can be deployed and run on a given system, since there could be software issues restricting a +capable hardware (see this [CUDA issue](#cuda-reduction-operation-error)). + +Since TU Dresden is a member of the SPEC consortium, the HPC benchmarks can be requested by anyone +interested. Please contact +[Holger Brunst](https://tu-dresden.de/zih/die-einrichtung/struktur/holger-brunst) for access. + +## Installation + +The target partition determines which of the parallelization models can be used, and vice versa. +For example, if you want to run a model including acceleration, you would have to use a partition +with GPUs. + +Once the target partition is determined, follow SPEC's +[Installation Guide](https://www.spec.org/hpg/hpc2021/Docs/install-guide-linux.html). +It is straight-forward and easy to use. + +???+ tip "Building for partition `ml`" + + The partition `ml` is a Power9 architecture. Thus, you need to provide the `-e ppc64le` switch + when installing. + +???+ tip "Building with NVHPC for partition `alpha`" + + To build the benchmark for partition `alpha`, you don't need an interactive session + on the target architecture. You can stay on the login nodes as long as you set the + flag `-tp=zen`. You can add this compiler flag to the configuration file. + +If you are facing errors during the installation process, check the [solved](#solved-issues) and +[unresolved issues](#unresolved-issues) sections for our systems. The problem might already be +listed there. + +## Configuration + +The behavior in terms of how to build, run and report the benchmark in a particular environment is +controlled by a configuration file. There are a few examples included in the source code. 
+Here you can apply compiler tuning and porting, specify the runtime environment and describe the +system under test. SPEChpc 2021 has been deployed on the partitions `haswell`, `ml` and +`alpha`. Configurations are available, respectively: + +- [gnu-taurus.cfg](misc/spec_gnu-taurus.cfg) +- [nvhpc-ppc.cfg](misc/spec_nvhpc-ppc.cfg) +- [nvhpc-alpha.cfg](misc/spec_nvhpc-alpha.cfg) + +No matter which one you choose as a starting point, +double-check the line that defines the submit command and make sure it says `srun [...]`, e.g. + +``` bash +submit = srun $command +``` + +Otherwise this can cause trouble (see [Slurm Bug](#slurm-bug)). +You can also put Slurm options in the configuration but it is recommended to do this in a job +script (see chapter [Execution](#execution)). Use the following to apply your configuration to the +benchmark run: + +``` +runhpc --config <configfile.cfg> [...] +``` + +For more details about configuration settings check out the following links: + +- [Config Files Description](https://www.spec.org/hpc2021/Docs/config.html) +- [Flag Description](https://www.spec.org/hpc2021/results/res2021q4/hpc2021-20210917-00050.flags.html) +- [Result File Fields Description](https://www.spec.org/hpc2021/docs/result-fields.html) + +## Execution + +The SPEChpc 2021 benchmark suite is executed with the `runhpc` command, which also sets its +configuration and controls its runtime behavior. For all options, see SPEC's documentation about +[`runhpc` options](https://www.spec.org/hpc2021/Docs/runhpc.html). +First, execute `source shrc` in your SPEC installation directory. Then use a job script to submit a +job with the benchmark or parts of it. + +In the following there are job scripts shown for partitions `haswell`, `ml` and `alpha`, +respectively. You can use them as a template in order to reproduce results or to transfer the +execution to a different partition.
+ +- Replace `<p_number_crunch>` (line 2) with your project name +- Replace `ws=</scratch/ws/spec/installation>` (line 15/18) with your SPEC installation path + +### Submit SPEChpc Benchmarks with a Job File + +=== "submit_spec_haswell_mpi.sh" + ```bash linenums="1" + #!/bin/bash + #SBATCH --account=<p_number_crunch> + #SBATCH --partition=haswell64 + #SBATCH --exclusive + #SBATCH --nodes=1 + #SBATCH --ntasks=24 + #SBATCH --cpus-per-task=1 + #SBATCH --mem-per-cpu=2541M + #SBATCH --time=16:00:00 + #SBATCH --constraint=DA + + module purge + module load gompi/2019a + + ws=</scratch/ws/spec/installation> + cd ${ws} + source shrc + + # reportable run with all benchmarks + BENCH="tiny" + + runhpc --config gnu-taurus --define model=mpi --ranks=24 --reportable --tune=base --flagsurl=$SPEC/config/flags/gcc_flags.xml ${BENCH} + ``` + +=== "submit_spec_ml_openacc.sh" + ```bash linenums="1" + #!/bin/bash + #SBATCH --account=<p_number_crunch> + #SBATCH --partition=ml + #SBATCH --exclusive + #SBATCH --nodes=1 + #SBATCH --ntasks=6 + #SBATCH --cpus-per-task=7 + #SBATCH --gpus-per-task=1 + #SBATCH --gres=gpu:6 + #SBATCH --mem-per-cpu=5772M + #SBATCH --time=00:45:00 + #SBATCH --export=ALL + #SBATCH --hint=nomultithread + + module --force purge + module load modenv/ml NVHPC OpenMPI/4.0.5-NVHPC-21.2-CUDA-11.2.1 + + ws=</scratch/ws/spec/installation> + cd ${ws} + source shrc + + export OMPI_CC=nvc + export OMPI_CXX=nvc++ + export OMPI_FC=nvfortran + + suite='tiny ^pot3d_t' + cfg=nvhpc_ppc.cfg + + # test run + runhpc -I --config ${cfg} --ranks ${SLURM_NTASKS} --define pmodel=acc --size=test --noreportable --tune=base --iterations=1 ${suite} + + # reference run + runhpc --config ${cfg} --ranks ${SLURM_NTASKS} --define pmodel=acc --rebuild --tune=base --iterations=3 ${suite} + ``` + +=== "submit_spec_alpha_openacc.sh" + ```bash linenums="1" + #!/bin/bash + #SBATCH --account=<p_number_crunch> + #SBATCH --partition=alpha + #SBATCH --exclusive + #SBATCH --nodes=1 + #SBATCH --ntasks-per-node=8 
+ #SBATCH --cpus-per-task=6 + #SBATCH --gpus-per-task=1 + #SBATCH --gres=gpu:8 + #SBATCH --mem-per-cpu=20624M + #SBATCH --time=00:45:00 + #SBATCH --export=ALL + #SBATCH --hint=nomultithread + + module --force purge + module load modenv/hiera NVHPC OpenMPI + + ws=</scratch/ws/spec/installation> + cd ${ws} + source shrc + + suite='tiny' + cfg=nvhpc_alpha.cfg + + # test run + runhpc -I --config ${cfg} --ranks ${SLURM_NTASKS} --define pmodel=acc --size=test --noreportable --tune=base --iterations=1 ${suite} + + # reference workload + runhpc --config ${cfg} --ranks ${SLURM_NTASKS} --define pmodel=acc --tune=base --iterations=3 ${suite} + ``` + +## Solved Issues + +### Fortran Compilation Error + +!!! failure "PGF90-F-0004-Corrupt or Old Module file" + +!!! note "Explanation" + + If this error arises during runtime, it means that the benchmark binaries and the MPI module + do not fit together. This happens when you have built the benchmarks written in Fortran with a + different compiler than which was used to build the MPI module that was loaded for the run. + +!!! success "Solution" + + 1. Use the correct MPI module + - The MPI module in use must be compiled with the same compiler that was used to build the + benchmark binaries. Check the results of `module avail` and choose a corresponding module. + 1. Rebuild the binaries + - Rebuild the binaries using the same compiler as for the compilation of the MPI module of + choice. + 1. Request a new module + - Ask the HPC support to install a compatible MPI module. + 1. Build your own MPI module (as a last resort) + - Download and build a private MPI module using the same compiler as for building the + benchmark binaries. + +### pmix Error + +!!! failure "PMIX ERROR" + + ```bash + It looks like the function `pmix_init` failed for some reason; your parallel process is + likely to abort. There are many reasons that a parallel process can + fail during pmix_init; some of which are due to configuration or + environment problems. 
This failure appears to be an internal failure; + + mix_progress_thread_start failed + --> Returned value -1 instead of PMIX_SUCCESS + + *** An error occurred in MPI_Init_thread + *** on a NULL communicator + *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, + *** and potentially your MPI job) + ``` + +!!! note "Explanation" + + This is most probably a MPI related issue. If you built your own MPI module, PMIX support might + be configured wrong. + +!!! success "Solution" + + Use `configure --with-pmix=internal` during the `cmake` configuration routine. + +### ORTE Error (too many processes) + +!!! failure "Error: system limit exceeded on number of processes that can be started" + + ORTE_ERROR_LOG: The system limit on number of children a process can have was reached. + +!!! note "Explanation" + + There are too many processes spawned, probably due to a wrong job allocation and/or invocation. + +!!! success "Solution" + + Check the invocation command line in your job script. It must not say `srun runhpc [...]` + there, but only `runhpc [...]`. The submit command in the [configuration](#configuration) file + already contains `srun`. When `srun` is called in both places, too many parallel processes are + spawned. + +### Error with OpenFabrics Device + +!!! warning "There was an error initializing an OpenFabrics device" + +!!! note "Explanation" + + "I think it’s just trying to find the InfiniBand libraries, which aren’t used, but can’t. + It’s probably safe to ignore." + <p style='text-align: right;'> Matthew Colgrove, Nvidia </p> + +!!! success "Solution" + + This is just a warning which cannot be suppressed, but can be ignored. + +### Out of Memory + +!!! failure "Out of memory" + + ``` + Out of memory allocating [...] bytes of device memory + call to cuMemAlloc returned error 2: Out of memory + ``` + +!!! note "Explanation" + + - When running on a single node with all of its memory allocated, there is not enough memory + for the benchmark. 
+ - When running on multiple nodes, this might be a wrong resource distribution caused by Slurm. + Check the `$SLURM_NTASKS_PER_NODE` environment variable. If it says something like `15,1` when + you requested 8 processes per node, Slurm was not able to hand over the resource distribution + to `mpirun`. + +!!! success "Solution" + + - Expand your job from single node to multiple nodes. + - Reduce the workload (e.g. from small to tiny). + - Make sure to use `srun` instead of `mpirun` as the submit command in your + [configuration](#configuration) file. + +## Unresolved Issues + +### CUDA Reduction Operation Error + +!!! failure "There was a problem while initializing support for the CUDA reduction operations." + +!!! note "Explanation" + + For OpenACC, NVHPC was in the process of adding OpenMP array reduction support which is needed + for the `pot3d` benchmark. An Nvidia driver version of 450.80.00 or higher is required. Since + the driver version on partition `ml` is 440.64.00, it is not supported and not possible to run + the `pot3d` benchmark in OpenACC mode here. + +!!! note "Workaround" + + As for the partition `ml`, you can only wait until the OS update to CentOS 8 is carried out, + as no driver update will be done beforehand. As a workaround, you can do one of the following: + + - Exclude the `pot3d` benchmark. + - Switch the partition (e.g. to partition `alpha`). + +### Slurm Bug + +!!! warning "Wrong resource distribution" + + When working with multiple nodes on partition `ml` or `alpha`, the Slurm parameter + `$SLURM_NTASKS_PER_NODE` does not work as intended when used in conjunction with `mpirun`. + +!!! note "Explanation" + + In the described case, when setting e.g. `SLURM_NTASKS_PER_NODE=8` and calling `mpirun`, Slurm + is not able to pass on the allocation settings correctly. With two nodes, this leads to a + distribution of 15 processes on the first node and 1 process on the second node instead.
In + fact, none of the proposed methods of Slurm's man page (like `--distribution=plane=8`) will + give the result as intended in this case. + +!!! note "Workaround" + + - Use `srun` instead of `mpirun`. + - Use `mpirun` along with a rank-binding perl script (like + `mpirun -np <ranks> perl <bind.pl> <command>`) as seen on the bottom of the configurations + [here](https://www.spec.org/hpc2021/results/res2021q4/hpc2021-20210908-00012.cfg) and + [here](https://www.spec.org/hpc2021/results/res2021q4/hpc2021-20210917-00056.cfg) + in order to enforce the correct distribution of ranks as it was intended. + +### Benchmark Hangs Forever + +!!! warning "The benchmark runs forever and produces a timeout." + +!!! note "Explanation" + + The reason for this is not known, however, it is caused by the flag `-DSPEC_ACCEL_AWARE_MPI`. + +!!! note "Workaround" + + Remove the flag `-DSPEC_ACCEL_AWARE_MPI` from the compiler options in your configuration file. + +### Other Issues + +For any further issues you can consult SPEC's +[FAQ page](https://www.spec.org/hpc2021/Docs/faq.html), search through their +[known issues](https://www.spec.org/hpc2021/Docs/known-problems.html) or contact their +[support](https://www.spec.org/hpc2021/Docs/techsupport.html). diff --git a/doc.zih.tu-dresden.de/docs/software/tensorflow.md b/doc.zih.tu-dresden.de/docs/software/tensorflow.md index 58b99bd1c302c0ed65619fc200602f2732f84df1..f11ecb3ac94e3cc65cf671815d813bacc9b9815f 100644 --- a/doc.zih.tu-dresden.de/docs/software/tensorflow.md +++ b/doc.zih.tu-dresden.de/docs/software/tensorflow.md @@ -96,7 +96,7 @@ the notebook by pre-loading a specific TensorFlow module: You can also define your own Jupyter kernel for more specific tasks. Please read about Jupyter kernels and virtual environments in our - [JupyterHub](../access/jupyterhub.md#creating-and-using-a-custom-environment) documentation. + [JupyterHub](../access/jupyterhub_custom_environments.md) documentation. 
## TensorFlow in Containers diff --git a/doc.zih.tu-dresden.de/docs/software/vampir.md b/doc.zih.tu-dresden.de/docs/software/vampir.md index ebaa368e73f445422644b6159c1ab677fc50fecf..efbc0717fb00e1e889c16bc6ab18e8d7db51836b 100644 --- a/doc.zih.tu-dresden.de/docs/software/vampir.md +++ b/doc.zih.tu-dresden.de/docs/software/vampir.md @@ -1,4 +1,4 @@ -# Vampir +# Study Course of Events with Vampir ## Introduction @@ -73,7 +73,23 @@ Launching VampirServer... Submitting slurm 30 minutes job (this might take a while)... ``` -Above automatically allocates its resources via the respective batch system. If you want to start +This way, a job with a time limit of 30 minutes and default resources is submitted. This might fit +your needs. If not, please feel free to request a **customized job** running VampirServer, e.g. + +```console +marie@login$ vampirserver start --ntasks=8 -- --time=01:00:00 -- --mem-per-cpu=3000M --partition=romeo +Launching VampirServer... +Submitting slurm 01:00:00 minutes job (this might take a while)... +``` + +The above `vampirserver` command automatically allocates its resources via the respective batch +system (i.e. [Slurm](../jobs_and_resources/slurm.md) on ZIH systems). As shown, you can customize +the resource requirements and time limit. This is especially useful if you run into performance +issues handling very large trace files. Please refer to `vampirserver --help` for further options +and usage. 
+ +If you want to start + VampirServer without a batch allocation or from inside an interactive allocation, use ```console diff --git a/doc.zih.tu-dresden.de/docs/support/support.md b/doc.zih.tu-dresden.de/docs/support/support.md index c2c9fbda8bbb70c1dddb82fb384b69a8201e6fb8..3582ae264c4ead7f41acdf14e9877af91b8c2d57 100644 --- a/doc.zih.tu-dresden.de/docs/support/support.md +++ b/doc.zih.tu-dresden.de/docs/support/support.md @@ -1,4 +1,4 @@ -# How to Ask for Support +# User Support ## Create a Ticket diff --git a/doc.zih.tu-dresden.de/mkdocs.yml b/doc.zih.tu-dresden.de/mkdocs.yml index 7c68972f5ea34a55bdd0cd8a38f0c17e2e94578a..098c08b74461bd678d2745b9013e95513591c8bf 100644 --- a/doc.zih.tu-dresden.de/mkdocs.yml +++ b/doc.zih.tu-dresden.de/mkdocs.yml @@ -5,28 +5,28 @@ nav: - Overview: application/overview.md - Terms of Use: application/terms_of_use.md - Request for Resources: application/request_for_resources.md - - Project Request Form: application/project_request_form.md + - Project Request Form Jards: application/project_request_form.md - Project Management: application/project_management.md - Acknowledgement: application/acknowledgement.md - Access to ZIH Systems: - Overview: access/overview.md - Connecting with SSH: - - Connecting via terminal (Linux, Mac, Windows): access/ssh_login.md - - Connecting via MobaXterm (Windows): access/ssh_mobaxterm.md - - Connecting via Putty (Windows): access/ssh_putty.md + - Connecting via Terminal (Linux, Mac, Windows): access/ssh_login.md + - Connecting with MobaXterm (Windows): access/ssh_mobaxterm.md + - Connecting with PuTTY (Windows): access/ssh_putty.md - Desktop Cloud Visualization (DCV): access/desktop_cloud_visualization.md - Graphical Applications with WebVNC: access/graphical_applications_with_webvnc.md - JupyterHub: - JupyterHub: access/jupyterhub.md - - JupyterHub Custom Environments: access/jupyterhub_custom_environments.md + - Custom Environments for JupyterHub: access/jupyterhub_custom_environments.md - 
JupyterHub for Teaching: access/jupyterhub_for_teaching.md - JupyterHub Teaching Example: access/jupyterhub_teaching_example.md - Key Fingerprints: access/key_fingerprints.md - Security Restrictions: access/security_restrictions.md - Data Transfer: - Overview: data_transfer/overview.md - - Datamover: data_transfer/datamover.md - - Export Nodes: data_transfer/export_nodes.md + - Transfer Data Inside ZIH Systems with Datamover: data_transfer/datamover.md + - Transfer Data to/from ZIH Systems via Export Nodes: data_transfer/export_nodes.md - Data Life Cycle Management: - Overview: data_lifecycle/overview.md - Filesystems: @@ -37,7 +37,7 @@ nav: - Warm Archive: data_lifecycle/warm_archive.md - Intermediate Archive: data_lifecycle/intermediate_archive.md - Workspaces: data_lifecycle/workspaces.md - - Long-Term Preservation: data_lifecycle/longterm_preservation.md + - Long-Term Preservation of Research Data: data_lifecycle/longterm_preservation.md - Sharing Data: data_lifecycle/data_sharing.md - User Environment: - Overview: software/overview.md @@ -50,7 +50,7 @@ nav: - Containers: - Singularity: software/containers.md - Singularity Recipes and Hints: software/singularity_recipe_hints.md - - Singularity for Power9: software/singularity_power9.md + - Singularity for Power9 Architecture: software/singularity_power9.md - Virtual Machines: software/virtual_machines.md - GPU-accelerated Containers for Deep Learning (NGC Containers): software/ngc_containers.md - External Licenses: software/licenses.md @@ -81,7 +81,7 @@ nav: - Compilers and Flags: software/compilers.md - GPU Programming: software/gpu_programming.md - Mathematics Libraries: software/math_libraries.md - - MPI Usage Issues: jobs_and_resources/mpi_issues.md + - Known Issues when Using MPI: jobs_and_resources/mpi_issues.md - Debugging: software/debuggers.md - Performance Engineering Tools: - Overview: software/performance_engineering_overview.md @@ -94,6 +94,7 @@ nav: - Profile Jobs with Slurm: 
jobs_and_resources/slurm_profiling.md - Study Course of Events with Vampir: software/vampir.md - Measure Energy Consumption: software/energy_measurement.md + - Compare System Performance with SPEChpc: software/spec.md - Utilities: software/utilities.md - HPC Resources and Jobs: - Overview: jobs_and_resources/overview.md @@ -111,21 +112,21 @@ nav: - Slurm Job File Generator: jobs_and_resources/slurm_generator.md - Checkpoint/Restart: jobs_and_resources/checkpoint_restart.md - Job Profiling: jobs_and_resources/slurm_profiling.md - - Binding And Distribution Of Tasks: jobs_and_resources/binding_and_distribution_of_tasks.md + - Binding and Distribution of Tasks: jobs_and_resources/binding_and_distribution_of_tasks.md - User Support: support/support.md - Archive: - Overview: archive/overview.md - Bio Informatics: archive/bioinformatics.md - CXFS End of Support: archive/cxfs_end_of_support.md - Load Leveler: archive/load_leveler.md - - No IB Jobs: archive/no_ib_jobs.md - - Phase2 Migration: archive/phase2_migration.md + - Jobs without InfiniBand: archive/no_ib_jobs.md + - Migration towards Phase 2: archive/phase2_migration.md - Platform LSF: archive/platform_lsf.md - - BeeGFS on Demand: archive/beegfs_on_demand.md - - Install JupyterHub: archive/install_jupyter.md + - BeeGFS Filesystem on Demand: archive/beegfs_on_demand.md + - Jupyter Installation: archive/install_jupyter.md - Switched-Off Systems: - Overview: archive/systems_switched_off.md - - From Deimos to Atlas: archive/migrate_to_atlas.md + - Migration From Deimos to Atlas: archive/migrate_to_atlas.md - System Altix: archive/system_altix.md - System Atlas: archive/system_atlas.md - System Deimos: archive/system_deimos.md diff --git a/doc.zih.tu-dresden.de/util/check-toc-equals-page-headings.py b/doc.zih.tu-dresden.de/util/check-toc-equals-page-headings.py new file mode 100755 index 0000000000000000000000000000000000000000..75f85df0821772480c0ce78336250921c3614a03 --- /dev/null +++ 
b/doc.zih.tu-dresden.de/util/check-toc-equals-page-headings.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +""" +Check for consistency between TOC and page headings. + +Author: Michael Bommhardt-Richter +""" + +import argparse +import sys +from pathlib import Path + +# {path/filename.md: [toc_heading, file_heading], ... } +TOCData = dict() + +whitelist = ["index.md","archive/"] + + +def get_heading_in_file(filename, docs_path): + # Read until first level one heading is found + f = Path.joinpath(docs_path, filename) + with open(f, "r") as file: + for line in file: + if line.startswith("#"): + # TODO Make sure it is really a level one heading! + # Will be empty if there is more than one "#". + return line.split("#")[1].strip() + + +def main(): + scriptpath = Path(__file__).resolve().parent + mkdocsyaml = Path.joinpath(scriptpath, "../", "mkdocs.yml") + if Path.exists(mkdocsyaml): + + docs_path = Path.joinpath(scriptpath, "../", "docs") + with open(mkdocsyaml, "r") as file: + c = file.readlines() + + for line in c: + line = line.rstrip() + + # "headline: path/file.md" -> "Headline" = "path/file.md" + if line.endswith(".md"): + line = line.split(" - ")[1] + line = line.split(": ") + + key = line[1] + file_heading = get_heading_in_file(line[1], docs_path) + TOCData[line[1]] = [line[0], file_heading] + + # Check TOC vs heading in corresponding md-file + cnt = 0 + for key, value in TOCData.items(): + if key in whitelist: + continue + if whitelist[1] in key: + continue + if value[0] == "Overview": + continue + if value[0] != value[1]: + cnt += 1 + print(f"{key:<40}{value[0]:<50} != {value[1]}") + sys.exit(cnt) + else: + print("Error: Could not find mkdocs.yml file.") + sys.exit(-1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Find differences in TOC and top level headings of md-files." 
+ ) + args = parser.parse_args() + + main() diff --git a/doc.zih.tu-dresden.de/util/download-newest-mermaid.js.sh b/doc.zih.tu-dresden.de/util/download-newest-mermaid.js.sh index 9986ad6f49e2e739f8a53d7911f4e346196d21a4..d01622e4bba4188479370be170339b1f01308074 100755 --- a/doc.zih.tu-dresden.de/util/download-newest-mermaid.js.sh +++ b/doc.zih.tu-dresden.de/util/download-newest-mermaid.js.sh @@ -6,4 +6,4 @@ scriptpath=${BASH_SOURCE[0]} basedir=`dirname "$scriptpath"` basedir=`dirname "$basedir"` cd $basedir/tud_theme/javascripts -wget https://unpkg.com/mermaid/dist/mermaid.min.js +wget https://unpkg.com/mermaid@9.4.0/dist/mermaid.min.js diff --git a/doc.zih.tu-dresden.de/wordlist.aspell b/doc.zih.tu-dresden.de/wordlist.aspell index 18b8555ca8c9e6f4756b57e9920c6fbeb5ba77df..54c0092c1c8cc3eda9c37f9780936ffe7ecc8b29 100644 --- a/doc.zih.tu-dresden.de/wordlist.aspell +++ b/doc.zih.tu-dresden.de/wordlist.aspell @@ -172,6 +172,7 @@ ifort ImageNet img Infiniband +InfluxDB init inode Instrumenter @@ -248,6 +249,7 @@ multicore multiphysics Multiphysics multithreaded +multithreading Multithreading NAMD Nationales