diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000000000000000000000000000000000..b09f511654852233da87ba9ccc35d29f0095004d --- /dev/null +++ b/.editorconfig @@ -0,0 +1,28 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +# Unix-style newlines with a newline ending every file +[*] +end_of_line = lf +insert_final_newline = true + +# Matches multiple files with brace expansion notation +# Set default charset +[*.{md,js,py}] +charset = utf-8 + +# 4 space indentation +[*.{md,py}] +indent_style = space +indent_size = 4 + +# Tab indentation (no size specified) +[Makefile] +indent_style = tab + +# Indentation override for all JS under lib directory +[lib/**.js] +indent_style = space +indent_size = 2 diff --git a/.gitignore b/.gitignore index ed9ec7dd5f3338e0cda169471c748dbdf5038a58..04c7fd320b19a3da2344057a2fd78ef420e71499 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *package-lock.json *package.json *node_modules -**venv/ \ No newline at end of file +**venv/ +doc.zih.tu-dresden.de/public/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cd73b9ea1ac450adb444920a10346f7d7d86cccb..f1875b3481da6d11053e5ad8aed49ae53033e5c4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ variables: GIT_STRATEGY: none - DOCKER_IMAGE: webpage:all + DOCKER_IMAGE: webpage:$CI_PIPELINE_ID workflow: rules: @@ -21,52 +21,67 @@ Build Linter: variables: GIT_STRATEGY: clone GIT_DEPTH: 0 - script: docker build -t ${DOCKER_IMAGE} . + script: docker build -t "${DOCKER_IMAGE}" . Test mkdocs: stage: test script: docker run ${DOCKER_IMAGE} +Check wording of changed md-files: + stage: test + script: + - docker run --rm -w /src -e CI_MERGE_REQUEST_TARGET_BRANCH_NAME "${DOCKER_IMAGE}" + doc.zih.tu-dresden.de/util/grep-forbidden-words.sh + only: [ merge_requests ] + Lint changed md-files: stage: test script: - - docker run --rm -w /src -e CI_MERGE_REQUEST_TARGET_BRANCH_NAME ${DOCKER_IMAGE} + - docker run --rm -w /src -e CI_MERGE_REQUEST_TARGET_BRANCH_NAME "${DOCKER_IMAGE}" doc.zih.tu-dresden.de/util/lint-changes.sh only: [ merge_requests ] +Check spelling for changed md-files: + stage: test + script: + - docker run --rm -w /src -e CI_MERGE_REQUEST_TARGET_BRANCH_NAME "${DOCKER_IMAGE}" + doc.zih.tu-dresden.de/util/check-spelling.sh + only: [ merge_requests ] + Check links for changed md-files: stage: test script: - - docker run --rm -w /src -e CI_MERGE_REQUEST_TARGET_BRANCH_NAME ${DOCKER_IMAGE} + - docker run --rm -w /src -e CI_MERGE_REQUEST_TARGET_BRANCH_NAME "${DOCKER_IMAGE}" doc.zih.tu-dresden.de/util/check-links.sh only: [ merge_requests ] Lint md-files: stage: test - script: docker run --rm ${DOCKER_IMAGE} markdownlint docs + script: docker run --rm "${DOCKER_IMAGE}" markdownlint docs only: [ main, preview ] Check links for md-files: stage: test script: - - docker run --rm ${DOCKER_IMAGE} + - docker run --rm "${DOCKER_IMAGE}" bash -c "find docs -type f -name '*.md' | xargs -L1 markdown-link-check --quiet" only: [ main, preview ] Release preview branch: stage: release script: - - docker run --rm -v /var/www/html/preview:/mnt ${DOCKER_IMAGE} mkdocs build --site-dir /mnt + - docker run --rm -v /var/www/html/preview:/mnt "${DOCKER_IMAGE}" mkdocs build --strict --site-dir /mnt only: [ preview ] Release: stage: release script: - - docker run --rm -v /var/www/html/hpc-wiki:/mnt ${DOCKER_IMAGE} mkdocs build --site-dir /mnt + - docker run --rm -v /var/www/html/hpc-wiki:/mnt "${DOCKER_IMAGE}" mkdocs build --strict --site-dir /mnt 
only: [ main ] Cleanup docker: stage: cleanup script: + - docker rmi -f "${DOCKER_IMAGE}" - docker system prune --force when: always diff --git a/Dockerfile b/Dockerfile index 67ffffaa2e29c4effe35714e3dca8128872252f6..731e831c9b2fc1ff1068ae2b2a80c04bbf0039c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8-buster +FROM python:3.8-bullseye ######## # Base # @@ -18,4 +18,4 @@ RUN npm install -g markdownlint-cli markdown-link-check WORKDIR /src/doc.zih.tu-dresden.de -CMD ["mkdocs", "build", "--verbose"] +CMD ["mkdocs", "build", "--verbose", "--strict"] diff --git a/doc.zih.tu-dresden.de/README.md b/doc.zih.tu-dresden.de/README.md index 6859e5ab8d39cbdf01bf77ba8438da49d1eb39da..31344cece97859451158faa45a172ebcacea1752 100644 --- a/doc.zih.tu-dresden.de/README.md +++ b/doc.zih.tu-dresden.de/README.md @@ -454,10 +454,8 @@ there is a list of conventions w.r.t. spelling and technical wording. * `Slurm` not `SLURM` * `Filesystem` not `file system` * `ZIH system` and `ZIH systems` not `Taurus`, `HRSKII`, `our HPC systems` etc. - -**TODO:** Put into file - -**TODO:** Implement checks [Issue #13](#13) +* `Workspace` not `work space` +* avoid term `HPC-DA` ### Code Blocks and Command Prompts diff --git a/doc.zih.tu-dresden.de/docs/accessibility.md b/doc.zih.tu-dresden.de/docs/accessibility.md new file mode 100644 index 0000000000000000000000000000000000000000..418d8a11c98be59a121a47f0d497dfce1a79aa05 --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/accessibility.md @@ -0,0 +1,42 @@ +# Erklärung zur Barrierefreiheit + +Diese Erklärung zur Barrierefreiheit gilt für die unter +[https://doc.zih.tu-dresden.de](https://doc.zih.tu-dresden.de) und +[https://hpc-wiki.zih.tu-dresden.de](https://hpc-wiki.zih.tu-dresden.de) veröffentlichte Website +der Technischen Universität Dresden. +Als öffentliche Stelle im Sinne des Barrierefreie-Websites-Gesetz (BfWebG) ist die Technische +Universität Dresden bemüht, ihre Websites und mobilen Anwendungen im Einklang mit den Bestimmungen +des Barrierefreie-Websites-Gesetz (BfWebG) in Verbindung mit der +Barrierefreie-Informationstechnik-Verordnung (BITV 2.0) barrierefrei zugänglich zu machen. + +## Erstellung dieser Erklärung zur Barrierefreiheit + +Diese Erklärung wurde am 17.09.2020 erstellt und zuletzt am 17.09.2020 aktualisiert. Grundlage der +Erstellung dieser Erklärung zur Barrierefreiheit ist eine am 17.09.2020 von der TU Dresden +durchgeführte Selbstbewertung. + +## Stand der Barrierefreiheit + +Es wurde bisher noch kein BITV-Test für die Website durchgeführt. Dieser ist bis 30.11.2020 geplant. + +## Kontakt + +Sollten Ihnen Mängel in Bezug auf die barrierefreie Gestaltung auffallen, können Sie uns diese über +das Formular [Barriere melden](https://tu-dresden.de/barrierefreiheit/barriere-melden) mitteilen und +im zugänglichen Format anfordern. Alternativ können Sie sich direkt an die Meldestelle für Barrieren +wenden (Koordinatorin: Mandy Weickert, E-Mail: <barrieren@tu-dresden.de>, Telefon: +49 351 +463-42022, Fax: +49 351 463-42021, Besucheradresse: Nöthnitzer Straße 46, APB 1102, 01187 Dresden). 
+ +## Durchsetzungsverfahren + +Wenn wir Ihre Rückmeldungen aus Ihrer Sicht nicht befriedigend bearbeiten, können Sie sich an die +Sächsische Durchsetzungsstelle wenden: + +Beauftragter der Sächsischen Staatsregierung für die Belange von Menschen mit Behinderungen +Albertstraße 10 +01097 Dresden +Postanschrift: Archivstraße 1, 01097 Dresden +E-Mail: <info.behindertenbeauftragter@sk.sachsen.de> +Telefon: +49 351 564-12161 +Fax: +49 351 564-12169 +Webseite: [https://www.inklusion.sachsen.de](https://www.inklusion.sachsen.de) diff --git a/doc.zih.tu-dresden.de/docs/application/project_request_form.md b/doc.zih.tu-dresden.de/docs/application/project_request_form.md index 07ed2eeb7d86c1041c55ae541a6a175f9df45d24..7a50b2274b2167e5d2efd89c7a4b1725074e8990 100644 --- a/doc.zih.tu-dresden.de/docs/application/project_request_form.md +++ b/doc.zih.tu-dresden.de/docs/application/project_request_form.md @@ -9,10 +9,10 @@ type="frame" align="right" caption="picture 1: login screen" width="170" zoom="on ">%ATTACHURL%/request_step1_b.png</span> -The first step is asking for the personal informations of the requester. +The first step is asking for the personal information of the requester. **That's you**, not the leader of this project! \<br />If you have an ZIH-Login, you can use it \<sup>\[Pic 1\]\</sup>. If not, you have to -fill in the whole informations \<sup>\[Pic.:2\]\</sup>. <span +fill in the whole information \<sup>\[Pic.:2\]\</sup>. <span class="twiki-macro IMAGE">clear</span> ## second step (project details) @@ -27,8 +27,8 @@ general project Details.\<br />Any project have: - Projects starts at the first of a month and ends on the last day of a month. So you are not able to send on the second of a month a project request which start in this month. - - The approval is for a maximum of one year. Be carfull: a - duratoin from "May, 2013" till "May 2014" has 13 month. + - The approval is for a maximum of one year. Be careful: a + duration from "May, 2013" till "May 2014" has 13 month. - a selected science, according to the DFG: <http://www.dfg.de/dfg_profil/gremien/fachkollegien/faecher/index.jsp> - a sponsorship @@ -45,7 +45,7 @@ general project Details.\<br />Any project have: <span class="twiki-macro IMAGE" type="frame" align="right" caption="picture 4: hardware" width="170" zoom="on ">%ATTACHURL%/request_step3_machines.png</span> This step inquire the -required hardware. You can find the specifications [here](../archive/hardware.md). +required hardware. You can find the specifications [here]**todo fix link** \<br />For your guidance: - gpu => taurus diff --git a/doc.zih.tu-dresden.de/docs/archive/debugging_tools.md b/doc.zih.tu-dresden.de/docs/archive/debugging_tools.md deleted file mode 100644 index 0d902d2cfeb23f9ca1763df909d6746b16be81da..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/archive/debugging_tools.md +++ /dev/null @@ -1,14 +0,0 @@ -# Debugging Tools - -Debugging is an essential but also rather time consuming step during application development. Tools -dramatically reduce the amount of time spent to detect errors. Besides the "classical" serial -programming errors, which may usually be easily detected with a regular debugger, there exist -programming errors that result from the usage of OpenMP, Pthreads, or MPI. These errors may also be -detected with debuggers (preferably debuggers with support for parallel applications), however, -specialized tools like MPI checking tools (e.g. Marmot) or thread checking tools (e.g. 
Intel Thread -Checker) can simplify this task. The following sections provide detailed information about the -different types of debugging tools: - -- [Debuggers] **todo** Debuggers -- debuggers (with and without support for parallel applications) -- [MPI Usage Error Detection] **todo** MPI Usage Error Detection -- tools to detect MPI usage errors -- [Thread Checking] **todo** Thread Checking -- tools to detect OpenMP/Pthread usage errors diff --git a/doc.zih.tu-dresden.de/docs/archive/deep_learning.md b/doc.zih.tu-dresden.de/docs/archive/deep_learning.md deleted file mode 100644 index da8c9c461fddc3c870ef418bb7db2b1ed493abe8..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/archive/deep_learning.md +++ /dev/null @@ -1,333 +0,0 @@ -# Deep learning - -**Prerequisites**: To work with Deep Learning tools you obviously need [Login](../access/ssh_login.md) -for the Taurus system and basic knowledge about Python, Slurm manager. - -**Aim** of this page is to introduce users on how to start working with Deep learning software on -both the ml environment and the scs5 environment of the Taurus system. - -## Deep Learning Software - -### TensorFlow - -[TensorFlow](https://www.tensorflow.org/guide/) is a free end-to-end open-source software library -for dataflow and differentiable programming across a range of tasks. - -TensorFlow is available in both main partitions -[ml environment and scs5 environment](modules.md#module-environments) -under the module name "TensorFlow". However, for purposes of machine learning and deep learning, we -recommend using Ml partition [HPC-DA](../jobs_and_resources/hpcda.md). For example: - -```Bash -module load TensorFlow -``` - -There are numerous different possibilities on how to work with [TensorFlow](tensorflow.md) on -Taurus. On this page, for all examples default, scs5 partition is used. Generally, the easiest way -is using the [modules system](modules.md) -and Python virtual environment (test case). However, in some cases, you may need directly installed -TensorFlow stable or night releases. For this purpose use the -[EasyBuild](custom_easy_build_environment.md), [Containers](tensorflow_container_on_hpcda.md) and see -[the example](https://www.tensorflow.org/install/pip). For examples of using TensorFlow for ml partition -with module system see [TensorFlow page for HPC-DA](tensorflow.md). - -Note: If you are going used manually installed TensorFlow release we recommend use only stable -versions. - -## Keras - -[Keras](https://keras.io/) is a high-level neural network API, written in Python and capable of -running on top of [TensorFlow](https://github.com/tensorflow/tensorflow) Keras is available in both -environments [ml environment and scs5 environment](modules.md#module-environments) under the module -name "Keras". - -On this page for all examples default scs5 partition used. There are numerous different -possibilities on how to work with [TensorFlow](tensorflow.md) and Keras -on Taurus. Generally, the easiest way is using the [module system](modules.md) and Python -virtual environment (test case) to see TensorFlow part above. -For examples of using Keras for ml partition with the module system see the -[Keras page for HPC-DA](keras.md). - -It can either use TensorFlow as its backend. As mentioned in Keras documentation Keras capable of -running on Theano backend. However, due to the fact that Theano has been abandoned by the -developers, we don't recommend use Theano anymore. 
If you wish to use Theano backend you need to -install it manually. To use the TensorFlow backend, please don't forget to load the corresponding -TensorFlow module. TensorFlow should be loaded automatically as a dependency. - -Test case: Keras with TensorFlow on MNIST data - -Go to a directory on Taurus, get Keras for the examples and go to the examples: - -```Bash -git clone https://github.com/fchollet/keras.git'>https://github.com/fchollet/keras.git -cd keras/examples/ -``` - -If you do not specify Keras backend, then TensorFlow is used as a default - -Job-file (schedule job with sbatch, check the status with 'squeue -u \<Username>'): - -```Bash -#!/bin/bash -#SBATCH --gres=gpu:1 # 1 - using one gpu, 2 - for using 2 gpus -#SBATCH --mem=8000 -#SBATCH -p gpu2 # select the type of nodes (options: haswell, smp, sandy, west, gpu, ml) K80 GPUs on Haswell node -#SBATCH --time=00:30:00 -#SBATCH -o HLR_<name_of_your_script>.out # save output under HLR_${SLURMJOBID}.out -#SBATCH -e HLR_<name_of_your_script>.err # save error messages under HLR_${SLURMJOBID}.err - -module purge # purge if you already have modules loaded -module load modenv/scs5 # load scs5 environment -module load Keras # load Keras module -module load TensorFlow # load TensorFlow module - -# if you see 'broken pipe error's (might happen in interactive session after the second srun -command) uncomment line below -# module load h5py - -python mnist_cnn.py -``` - -Keep in mind that you need to put the bash script to the same folder as an executable file or -specify the path. - -Example output: - -```Bash -x_train shape: (60000, 28, 28, 1) 60000 train samples 10000 test samples Train on 60000 samples, -validate on 10000 samples Epoch 1/12 - -128/60000 [..............................] - ETA: 12:08 - loss: 2.3064 - acc: 0.0781 256/60000 -[..............................] - ETA: 7:04 - loss: 2.2613 - acc: 0.1523 384/60000 -[..............................] - ETA: 5:22 - loss: 2.2195 - acc: 0.2005 - -... - -60000/60000 [==============================] - 128s 2ms/step - loss: 0.0296 - acc: 0.9905 - -val_loss: 0.0268 - val_acc: 0.9911 Test loss: 0.02677746053306255 Test accuracy: 0.9911 -``` - -## Datasets - -There are many different datasets designed for research purposes. If you would like to download some -of them, first of all, keep in mind that many machine learning libraries have direct access to -public datasets without downloading it (for example -[TensorFlow Datasets](https://www.tensorflow.org/datasets). - -If you still need to download some datasets, first of all, be careful with the size of the datasets -which you would like to download (some of them have a size of few Terabytes). Don't download what -you really not need to use! Use login nodes only for downloading small files (hundreds of the -megabytes). For downloading huge files use [DataMover](../data_transfer/data_mover.md). -For example, you can use command `dtwget` (it is an analogue of the general wget -command). This command submits a job to the data transfer machines. If you need to download or -allocate massive files (more than one terabyte) please contact the support before. - -### The ImageNet dataset - -The [ImageNet](http://www.image-net.org/) project is a large visual database designed for use in -visual object recognition software research. 
In order to save space in the file system by avoiding -to have multiple duplicates of this lying around, we have put a copy of the ImageNet database -(ILSVRC2012 and ILSVR2017) under `/scratch/imagenet` which you can use without having to download it -again. For the future, the ImageNet dataset will be available in `/warm_archive`. ILSVR2017 also -includes a dataset for recognition objects from a video. Please respect the corresponding -[Terms of Use](https://image-net.org/download.php). - -## Jupyter Notebook - -Jupyter notebooks are a great way for interactive computing in your web browser. Jupyter allows -working with data cleaning and transformation, numerical simulation, statistical modelling, data -visualization and of course with machine learning. - -There are two general options on how to work Jupyter notebooks using HPC: remote Jupyter server and -JupyterHub. - -These sections show how to run and set up a remote Jupyter server within a sbatch GPU job and which -modules and packages you need for that. - -**Note:** On Taurus, there is a [JupyterHub](../access/jupyterhub.md), where you do not need the -manual server setup described below and can simply run your Jupyter notebook on HPC nodes. Keep in -mind, that, with JupyterHub, you can't work with some special instruments. However, general data -analytics tools are available. - -The remote Jupyter server is able to offer more freedom with settings and approaches. - -### Preparation phase (optional) - -On Taurus, start an interactive session for setting up the -environment: - -```Bash -srun --pty -n 1 --cpus-per-task=2 --time=2:00:00 --mem-per-cpu=2500 --x11=first bash -l -i -``` - -Create a new subdirectory in your home, e.g. Jupyter - -```Bash -mkdir Jupyter cd Jupyter -``` - -There are two ways how to run Anaconda. The easiest way is to load the Anaconda module. The second -one is to download Anaconda in your home directory. - -1. Load Anaconda module (recommended): - -```Bash -module load modenv/scs5 module load Anaconda3 -``` - -1. Download latest Anaconda release (see example below) and change the rights to make it an -executable script and run the installation script: - -```Bash -wget https://repo.continuum.io/archive/Anaconda3-2019.03-Linux-x86_64.sh chmod 744 -Anaconda3-2019.03-Linux-x86_64.sh ./Anaconda3-2019.03-Linux-x86_64.sh - -(during installation you have to confirm the license agreement) -``` - -Next step will install the anaconda environment into the home -directory (/home/userxx/anaconda3). Create a new anaconda environment with the name "jnb". - -```Bash -conda create --name jnb -``` - -### Set environmental variables on Taurus - -In shell activate previously created python environment (you can -deactivate it also manually) and install Jupyter packages for this python environment: - -```Bash -source activate jnb conda install jupyter -``` - -If you need to adjust the configuration, you should create the template. Generate config files for -Jupyter notebook server: - -```Bash -jupyter notebook --generate-config -``` - -Find a path of the configuration file, usually in the home under `.jupyter` directory, e.g. 
-`/home//.jupyter/jupyter_notebook_config.py` - -Set a password (choose easy one for testing), which is needed later on to log into the server -in browser session: - -```Bash -jupyter notebook password Enter password: Verify password: -``` - -You get a message like that: - -```Bash -[NotebookPasswordApp] Wrote *hashed password* to -/home/<zih_user>/.jupyter/jupyter_notebook_config.json -``` - -I order to create an SSL certificate for https connections, you can create a self-signed -certificate: - -```Bash -openssl req -x509 -nodes -days 365 -newkey rsa:1024 -keyout mykey.key -out mycert.pem -``` - -Fill in the form with decent values. - -Possible entries for your Jupyter config (`.jupyter/jupyter_notebook*config.py*`). Uncomment below -lines: - -```Bash -c.NotebookApp.certfile = u'<path-to-cert>/mycert.pem' c.NotebookApp.keyfile = -u'<path-to-cert>/mykey.key' - -# set ip to '*' otherwise server is bound to localhost only c.NotebookApp.ip = '*' -c.NotebookApp.open_browser = False - -# copy hashed password from the jupyter_notebook_config.json c.NotebookApp.password = u'<your -hashed password here>' c.NotebookApp.port = 9999 c.NotebookApp.allow_remote_access = True -``` - -Note: `<path-to-cert>` - path to key and certificate files, for example: -(`/home/\<username>/mycert.pem`) - -### Slurm job file to run the Jupyter server on Taurus with GPU (1x K80) (also works on K20) - -```Bash -#!/bin/bash -l #SBATCH --gres=gpu:1 # request GPU #SBATCH --partition=gpu2 # use GPU partition -SBATCH --output=notebook_output.txt #SBATCH --nodes=1 #SBATCH --ntasks=1 #SBATCH --time=02:30:00 -SBATCH --mem=4000M #SBATCH -J "jupyter-notebook" # job-name #SBATCH -A <name_of_your_project> - -unset XDG_RUNTIME_DIR # might be required when interactive instead of sbatch to avoid -'Permission denied error' srun jupyter notebook -``` - -Start the script above (e.g. with the name jnotebook) with sbatch command: - -```Bash -sbatch jnotebook.slurm -``` - -If you have a question about sbatch script see the article about [Slurm](../jobs_and_resources/slurm.md). - -Check by the command: `tail notebook_output.txt` the status and the **token** of the server. It -should look like this: - -```Bash -https://(taurusi2092.taurus.hrsk.tu-dresden.de or 127.0.0.1):9999/ -``` - -You can see the **server node's hostname** by the command: `squeue -u <username>`. - -Remote connect to the server - -There are two options on how to connect to the server: - -1. You can create an ssh tunnel if you have problems with the -solution above. Open the other terminal and configure ssh -tunnel: (look up connection values in the output file of Slurm job, e.g.) (recommended): - -```Bash -node=taurusi2092 #see the name of the node with squeue -u <your_login> -localport=8887 #local port on your computer remoteport=9999 -#pay attention on the value. It should be the same value as value in the notebook_output.txt ssh --fNL ${localport}:${node}:${remoteport} <zih_user>@taurus.hrsk.tu-dresden.de #configure -of the ssh tunnel for connection to your remote server pgrep -f "ssh -fNL ${localport}" -#verify that tunnel is alive -``` - -2. On your client (local machine) you now can connect to the server. You need to know the **node's - hostname**, the **port** of the server and the **token** to login (see paragraph above). - -You can connect directly if you know the IP address (just ping the node's hostname while logged on -Taurus). 
- -```Bash -#comand on remote terminal taurusi2092$> host taurusi2092 # copy IP address from output # paste -IP to your browser or call on local terminal e.g. local$> firefox https://<IP>:<PORT> # https -important to use SSL cert -``` - -To login into the Jupyter notebook site, you have to enter the **token**. -(`https://localhost:8887`). Now you can create and execute notebooks on Taurus with GPU support. - -If you would like to use [JupyterHub](../access/jupyterhub.md) after using a remote manually configured -Jupyter server (example above) you need to change the name of the configuration file -(`/home//.jupyter/jupyter_notebook_config.py`) to any other. - -### F.A.Q - -**Q:** - I have an error to connect to the Jupyter server (e.g. "open failed: administratively -prohibited: open failed") - -**A:** - Check the settings of your Jupyter config file. Is it all necessary lines uncommented, the -right path to cert and key files, right hashed password from .json file? Check is the used local -port [available](https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers) -Check local settings e.g. (`/etc/ssh/sshd_config`, `/etc/hosts`). - -**Q:** I have an error during the start of the interactive session (e.g. PMI2_Init failed to -initialize. Return code: 1) - -**A:** Probably you need to provide `--mpi=none` to avoid ompi errors (). -`srun --mpi=none --reservation \<...> -A \<...> -t 90 --mem=4000 --gres=gpu:1 ---partition=gpu2-interactive --pty bash -l` diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware.md b/doc.zih.tu-dresden.de/docs/archive/hardware.md deleted file mode 100644 index 624b9b745fcd6adb67bb8984f8d0f648c8224faf..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/archive/hardware.md +++ /dev/null @@ -1,17 +0,0 @@ -# Hardware - -Here, you can find basic information about the hardware installed at ZIH. We try to keep this list -up-to-date. - -- [BULL HPC-Cluster Taurus](taurus_ii.md) -- [SGI Ultraviolet (UV)](hardware_venus.md) - -Hardware hosted by ZIH: - -Former systems - -- [PC-Farm Deimos](hardware_deimos.md) -- [SGI Altix](hardware_altix.md) -- [PC-Farm Atlas](hardware_atlas.md) -- [PC-Cluster Triton](hardware_triton.md) -- [HPC-Windows-Cluster Titan](hardware_titan.md) diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware_altix.md b/doc.zih.tu-dresden.de/docs/archive/hardware_altix.md deleted file mode 100644 index 202ab10bda1d8829ede7a1fc52da9bf6db292a78..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/archive/hardware_altix.md +++ /dev/null @@ -1,87 +0,0 @@ -# HPC Component SGI Altix - -The SGI Altix 4700 is a shared memory system with dual core Intel -Itanium 2 CPUs (Montecito) operated by the Linux operating system SuSE -SLES 10 with a 2.6 kernel. Currently, the following Altix partitions are -installed at ZIH: - -|Name|Total Cores|Compute Cores|Memory per Core| -|:----|:----|:----|:----| -| Mars |384 |348 |1 GB| -|Jupiter |512 |506 |4 GB| -|Saturn |512 |506 |4 GB| -|Uranus |512 |506|4 GB| -|Neptun |128 |128 |1 GB| - -The jobs for these partitions (except Neptun) are scheduled by the [Platform LSF](platform_lsf.md) -batch system running on `mars.hrsk.tu-dresden.de`. The actual placement of a submitted job may -depend on factors like memory size, number of processors, time limit. - -## Filesystems - -All partitions share the same CXFS filesystems `/work` and `/fastfs`. - -## ccNuma Architecture - -The SGI Altix has a ccNUMA architecture, which stands for Cache Coherent Non-Uniform Memory Access. 
-It can be considered as a SM-MIMD (*shared memory - multiple instruction multiple data*) machine. -The SGI ccNuma system has the following properties: - -- Memory is physically distributed but logically shared -- Memory is kept coherent automatically by hardware. -- Coherent memory: memory is always valid (caches hold copies) -- Granularity is L3 cacheline (128 B) -- Bandwidth of NumaLink4 is 6.4 GB/s - -The ccNuma is a compromise between a distributed memory system and a flat symmetric multi processing -machine (SMP). Altough the memory is shared, the access properties are not the same. - -## Compute Module - -The basic compute module of an Altix system is shown below. - -| | -|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| \<img src="%ATTACHURLPATH%/altix_brick_web.png" alt="altix_brick_web.png" width='312' height='192' />\<CAPTION ALIGN="BOTTOM">Altix compute blade \</CAPTION> | - -It consists of one dual core Intel Itanium 2 "Montecito" processor, the -local memory of 4 GB (2 GB on `Mars`), and the communication component, -the so-called SHUB. All resources are shared by both cores. They have a -common front side bus, so that accumulated memory bandwidth for both is -not higher than for just one core. - -The SHUB connects local and remote ressources. Via the SHUB and NUMAlink -all CPUs can access remote memory in the whole system. Naturally, the -fastest access provides local memory. There are some hints and commands -that may help you to get optimal memory allocation and process placement -). Four of these blades are grouped together with a NUMA router in a -compute brick. All bricks are connected with NUMAlink4 in a -"fat-tree"-topology. - -| | -|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| \<img src="%ATTACHURLPATH%/memory_access_web.png" alt="memory_access_web.png" width='450' />\<CAPTION align="bottom">Remote memory access via SHUBs and NUMAlink \</CAPTION> | - -## CPU - -The current SGI Altix is based on the dual core Intel Itanium 2 -processor (codename "Montecito"). One core has the following basic -properties: - -| | | -|-------------------------------------|----------------------------| -| clock rate | 1.6 GHz | -| integer units | 6 | -| floating point units (multiply-add) | 2 | -| peak performance | 6.4 GFLOPS | -| L1 cache | 2 x 16 kB, 1 clock latency | -| L2 cache | 256 kB, 5 clock latency | -| L3 cache | 9 MB, 12 clock latency | -| front side bus | 128 bit x 200 MHz | - -The theoretical peak performance of all Altix partitions is hence about 13.1 TFLOPS. - -The processor has hardware support for efficient software pipelining. For many scientific -applications it provides a high sustained performance exceeding the performance of RISC CPUs with -similar peak performance. On the down side is the fact that the compiler has to explicitely discover -and exploit the parallelism in the application. 
diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware_atlas.md b/doc.zih.tu-dresden.de/docs/archive/hardware_atlas.md deleted file mode 100644 index 62a81ae538fcc40a1664483e1d5353b57ac3e6d1..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/archive/hardware_atlas.md +++ /dev/null @@ -1,46 +0,0 @@ -# MEGWARE PC-Farm Atlas - -The PC farm `Atlas` is a heterogenous cluster based on multicore chips -AMD Opteron 6274 ("Bulldozer"). The nodes are operated by the Linux -operating system SuSE SLES 11 with a 2.6 kernel. Currently, the -following hardware is installed: - -| CPUs |AMD Opteron 6274 | -| number of cores | 5120 | -|th. peak performance | 45 TFlops | -|compute nodes | 4-way nodes *Saxonid* with 64 cores | -|nodes with 64 GB RAM | 48 | -|nodes with 128 GB RAM | 12 | -|nodes with 512 GB RAM | 8 | - -Mars and Deimos users: Please read the [migration hints](migrate_to_atlas.md). - -All nodes share the `/home` and `/fastfs` file system with our other HPC systems. Each -node has 180 GB local disk space for scratch mounted on `/tmp` . The jobs for the compute nodes are -scheduled by the [Platform LSF](platform_lsf.md) batch system from the login nodes -`atlas.hrsk.tu-dresden.de` . - -A QDR Infiniband interconnect provides the communication and I/O infrastructure for low latency / -high throughput data traffic. - -Users with a login on the [SGI Altix](hardware_altix.md) can access their home directory via NFS -below the mount point `/hpc_work`. - -## CPU AMD Opteron 6274 - -| Clock rate | 2.2 GHz | -| cores | 16 | -| L1 data cache | 16 KB per core | -| L1 instruction cache | 64 KB shared in a *module* (i.e. 2 cores) | -| L2 cache | 2 MB per module | -| L3 cache | 12 MB total, 6 MB shared between 4 modules = 8 cores | -| FP units | 1 per module (supports fused multiply-add) | -| th. peak performance | 8.8 GFlops per core (w/o turbo) | - -The CPU belongs to the x86_64 family. Since it is fully capable of -running x86-code, one should compare the performances of the 32 and 64 -bit versions of the same code. - -For more architectural details, see the -[AMD Bulldozer block diagram](http://upload.wikimedia.org/wikipedia/commons/e/ec/AMD_Bulldozer_block_diagram_%288_core_CPU%29.PNG) -and [topology of Atlas compute nodes] **todo** %ATTACHURL%/Atlas_Knoten.pdf. diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware_venus.md b/doc.zih.tu-dresden.de/docs/archive/hardware_venus.md deleted file mode 100644 index be90985eace893cbf28753d5fbd2463402338e67..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/archive/hardware_venus.md +++ /dev/null @@ -1,20 +0,0 @@ -# SGI UV2000 (venus) - -The SGI UV2000 is a shared memory system based on Intel Sandy Bridge -processors. It is operated by the Linux operating system SLES 11 SP 3 -with a kernel version 3.x. - -| | | -|----------------------------|-------| -| Number of CPU sockets | 64 | -| Physical cores per sockets | 8 | -| Total number of cores | 512 | -| Total memory | 8 TiB | - -From our experience, most parallel applications benefit from using the -additional hardware hyperthreads. - -## Filesystems - -Venus uses the same HOME file system as all our other HPC installations. -For computations, please use `/scratch`. 
diff --git a/doc.zih.tu-dresden.de/docs/archive/install_jupyter.md b/doc.zih.tu-dresden.de/docs/archive/install_jupyter.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c22335d875e36db29bc682ca0d4c8af95685f5e
--- /dev/null
+++ b/doc.zih.tu-dresden.de/docs/archive/install_jupyter.md
@@ -0,0 +1,181 @@
+# Jupyter Installation
+
+Jupyter notebooks are a great way for interactive computing in your web browser. Jupyter allows
+working with data cleaning and transformation, numerical simulation, statistical modeling, data
+visualization and of course with machine learning.
+
+There are two general options on how to work with Jupyter notebooks using HPC: remote Jupyter
+server and JupyterHub.
+
+These sections show how to run and set up a remote Jupyter server within a sbatch GPU job and which
+modules and packages you need for that.
+
+**Note:** On ZIH system, there is a [JupyterHub](../access/jupyterhub.md), where you do not need the
+manual server setup described below and can simply run your Jupyter notebook on HPC nodes. Keep in
+mind that, with JupyterHub, you can't work with some special instruments. However, general data
+analytics tools are available.
+
+The remote Jupyter server is able to offer more freedom with settings and approaches.
+
+## Preparation phase (optional)
+
+On ZIH system, start an interactive session for setting up the
+environment:
+
+```Bash
+srun --pty -n 1 --cpus-per-task=2 --time=2:00:00 --mem-per-cpu=2500 --x11=first bash -l -i
+```
+
+Create a new directory in your home, e.g. Jupyter
+
+```Bash
+mkdir Jupyter cd Jupyter
+```
+
+There are two ways how to run Anaconda. The easiest way is to load the Anaconda module. The second
+one is to download Anaconda in your home directory.
+
+1. Load Anaconda module (recommended):
+
+```Bash
+module load modenv/scs5 module load Anaconda3
+```
+
+1. Download latest Anaconda release (see example below) and change the rights to make it an
+executable script and run the installation script:
+
+```Bash
+wget https://repo.continuum.io/archive/Anaconda3-2019.03-Linux-x86_64.sh chmod 744
+Anaconda3-2019.03-Linux-x86_64.sh ./Anaconda3-2019.03-Linux-x86_64.sh
+
+(during installation you have to confirm the license agreement)
+```
+
+Next step will install the anaconda environment into the home
+directory (`/home/userxx/anaconda3`). Create a new anaconda environment with the name `jnb`.
+
+```Bash
+conda create --name jnb
+```
+
+## Set environment variables
+
+In shell activate previously created python environment (you can
+deactivate it also manually) and install Jupyter packages for this python environment:
+
+```Bash
+source activate jnb conda install jupyter
+```
+
+If you need to adjust the configuration, you should create the template. Generate configuration
+files for Jupyter notebook server:
+
+```Bash
+jupyter notebook --generate-config
+```
+
+Find a path of the configuration file, usually in the home under `.jupyter` directory, e.g.
+`/home//.jupyter/jupyter_notebook_config.py`
+
+Set a password (choose an easy one for testing), which is needed later on to log into the server
+in browser session:
+
+```Bash
+jupyter notebook password Enter password: Verify password:
+```
+
+You get a message like that:
+
+```Bash
+[NotebookPasswordApp] Wrote *hashed password* to
+/home/<zih_user>/.jupyter/jupyter_notebook_config.json
+```
+
+In order to create a certificate for secure connections, you can create a self-signed
+certificate:
+
+```Bash
+openssl req -x509 -nodes -days 365 -newkey rsa:1024 -keyout mykey.key -out mycert.pem
+```
+
+Fill in the form with decent values.
+
+Possible entries for your Jupyter configuration (`.jupyter/jupyter_notebook*config.py*`).
+
+```Bash
+c.NotebookApp.certfile = u'<path-to-cert>/mycert.pem' c.NotebookApp.keyfile =
+u'<path-to-cert>/mykey.key'
+
+# set ip to '*' otherwise server is bound to localhost only c.NotebookApp.ip = '*'
+c.NotebookApp.open_browser = False
+
+# copy hashed password from the jupyter_notebook_config.json c.NotebookApp.password = u'<your
+hashed password here>' c.NotebookApp.port = 9999 c.NotebookApp.allow_remote_access = True
+```
+
+Note: `<path-to-cert>` - path to key and certificate files, for example:
+(`/home/\<username>/mycert.pem`)
+
+## Slurm job file to run the Jupyter server on ZIH system with GPU (1x K80) (also works on K20)
+
+```Bash
+#!/bin/bash -l #SBATCH --gres=gpu:1 # request GPU #SBATCH --partition=gpu2 # use GPU partition
+SBATCH --output=notebook_output.txt #SBATCH --nodes=1 #SBATCH --ntasks=1 #SBATCH --time=02:30:00
+SBATCH --mem=4000M #SBATCH -J "jupyter-notebook" # job-name #SBATCH -A <name_of_your_project>
+
+unset XDG_RUNTIME_DIR # might be required when interactive instead of sbatch to avoid
+'Permission denied error' srun jupyter notebook
+```
+
+Start the script above (e.g. with the name `jnotebook`) with sbatch command:
+
+```Bash
+sbatch jnotebook.slurm
+```
+
+If you have a question about sbatch script see the article about [Slurm](../jobs_and_resources/slurm.md).
+
+Check by the command: `tail notebook_output.txt` the status and the **token** of the server. It
+should look like this:
+
+```Bash
+https://(taurusi2092.taurus.hrsk.tu-dresden.de or 127.0.0.1):9999/
+```
+
+You can see the **server node's hostname** by the command: `squeue -u <username>`.
+
+Remote connect to the server
+
+There are two options on how to connect to the server:
+
+1. You can create an ssh tunnel if you have problems with the
+solution above. Open the other terminal and configure ssh
+tunnel: (look up connection values in the output file of Slurm job, e.g.) (recommended):
+
+```Bash
+node=taurusi2092 #see the name of the node with squeue -u <your_login>
+localport=8887 #local port on your computer remoteport=9999
+#pay attention on the value. It should be the same value as value in the notebook_output.txt ssh
+-fNL ${localport}:${node}:${remoteport} <zih_user>@taurus.hrsk.tu-dresden.de #configure
+of the ssh tunnel for connection to your remote server pgrep -f "ssh -fNL ${localport}"
+#verify that tunnel is alive
+```
+
+2. On your client (local machine) you now can connect to the server. You need to know the **node's
+   hostname**, the **port** of the server and the **token** to login (see paragraph above).
+
+You can connect directly if you know the IP address (just ping the node's hostname while logged on
+ZIH system).
+ +```Bash +#comand on remote terminal taurusi2092$> host taurusi2092 # copy IP address from output # paste +IP to your browser or call on local terminal e.g. local$> firefox https://<IP>:<PORT> # https +important to use SSL cert +``` + +To login into the Jupyter notebook site, you have to enter the **token**. +(`https://localhost:8887`). Now you can create and execute notebooks on ZIH system with GPU support. + +If you would like to use [JupyterHub](../access/jupyterhub.md) after using a remote manually configured +Jupyter server (example above) you need to change the name of the configuration file +(`/home//.jupyter/jupyter_notebook_config.py`) to any other. diff --git a/doc.zih.tu-dresden.de/docs/archive/load_leveler.md b/doc.zih.tu-dresden.de/docs/archive/load_leveler.md index fb85aaf079e6769005a461ee226f5329210feb69..07daea3dbcef9d375a57f47dbec1d0d8a27d0491 100644 --- a/doc.zih.tu-dresden.de/docs/archive/load_leveler.md +++ b/doc.zih.tu-dresden.de/docs/archive/load_leveler.md @@ -1,10 +1,17 @@ # LoadLeveler - IBM Tivoli Workload Scheduler +!!! warning + + This page is deprecated. + ## Job Submission First of all, to submit a job to LoadLeveler a job file needs to be created. This job file can be passed to the command: -`llsubmit [llsubmit_options] <job_file>` + +``` +llsubmit [llsubmit_options] <job_file> +``` ### Job File Examples @@ -29,7 +36,7 @@ An example job file may look like this: ``` This example requests a serial job with a runtime of 30 minutes and a -overall memory requirement of 1GByte. There are four groups available, +overall memory requirement of 1 GB. There are four groups available, don't forget to choose the one and only matching group. When the job completes, a mail will be sent which includes details about resource usage. @@ -58,9 +65,9 @@ mpirun -x OMP_NUM_THREADS=1 -x LD_LIBRARY_PATH -np 16 ./my_mpi_program ``` This example requests a parallel job with 16 processes (2 nodes, 8 tasks -per node), a runtime of 30 minutes, 1GByte memory requirement per task -and therefore a overall memory requirement of 8GByte per node. Please -keep in mind that each node on Triton only provides 45GByte. The choice +per node), a runtime of 30 minutes, 1 GB memory requirement per task +and therefore a overall memory requirement of 8 GB per node. Please +keep in mind that each node on Triton only provides 45 GB. The choice of the correct group is also important and necessary. The `-x` option of `mpirun` exports the specified environment variables to all MPI processes. @@ -105,10 +112,10 @@ mpirun -x OMP_NUM_THREADS=8 -x LD_LIBRARY_PATH -np 4 --bynode ./my_hybrid_progra ``` This example requests a parallel job with 32 processes (4 nodes, 8 tasks -per node), a runtime of 30 minutes, 1GByte memory requirement per task -and therefore a overall memory requirement of 8GByte per node. Please -keep in mind that each node on Triton only provides 45GByte. The choice -of the correct group is also important and necessary. The mpirun command +per node), a runtime of 30 minutes, 1 GB memory requirement per task +and therefore a overall memory requirement of 8 GB per node. Please +keep in mind that each node on Triton only provides 45 GB. The choice +of the correct group is also important and necessary. The `mpirun` command starts 4 MPI processes (`--bynode` forces one process per node). `OMP_NUM_THREADS` is set to 8, so that 8 threads are started per MPI rank. When the job completes, a mail will be sent which includes details @@ -119,14 +126,14 @@ about resource usage. 
| Keyword | Valid values | Description | |:-------------------|:------------------------------------------------|:-------------------------------------------------------------------------------------| | `notification` | `always`, `error`, `start`, `never`, `complete` | When to write notification email. | -| `notify_user` | valid email adress | Notification email adress. | +| `notify_user` | valid email address | Notification email address. | | `output` | file name | File for stdout of the job. | | `error` | file name | File for stderr of the job. | | `job_type` | `parallel`, `serial` | Job type, default is `serial`. | | `node` | `1` - `64` | Number of nodes requested (parallel jobs only). | | `tasks_per_node` | `1` - `8` | Number of processors per node requested (parallel jobs only). | | `class` | see `llclass` | Job queue. | -| `group` | triton-ww, triton-ipf, triton-ism, triton-et | choose matching group | +| `group` | `triton-ww`, `triton-ipf`, `triton-ism`, `triton-et` | choose matching group | | `wall_clock_limit` | HH:MM:SS | Run time limit of the job. | | `resources` | `name(count)` ... `name(count)` | Specifies quantities of the consumable resources consumed by each task of a job step | @@ -139,43 +146,46 @@ description of keywords\]\]. Submission of a job without a job file can be done by the command: `llsub [llsub_options] <command>` -This command is not part of the IBM Loadleveler software but was -developed at ZIH. +This command is not part of the IBM LoadLeveler software but was developed at ZIH. -The job file will be created in background by means of the command line -options. Afterwards, the job file will be passed to the command -`llsubmit` which submit the job to LoadLeveler (see above). +The job file will be created in background by means of the command line options. Afterwards, the job +file will be passed to the command `llsubmit` which submit the job to LoadLeveler (see above). Important options are: -| Option | Default | Description | -|:----------------------|:---------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `-J <name>` | `llsub` | Specifies the name of the job. You can name the job using any combination of letters, numbers, or both. The job name only appears in the long reports of the llq, llstatus, and llsummary commands. | -| `-n` | `1` | Specifies the total number of tasks of a parallel job you want to run on all available nodes. | -| `-T` | not specified | Specifies the maximum number of OpenMP threads to use per process by setting the environment variable OMP_NUM_THREADS to number. | -| `--o, -oo <filename>` | `<jobname>.<hostname>.<jobid>.out` | Specifies the name of the file to use as standard output (stdout) when your job step runs. | -| `-e, -oe <filename>` | `<jobname>.<hostname>.<jobid>.err` | Specifies the name of the file to use as standard error (stderr) when your job step runs. | -| `-I` | not specified | Submits an interactive job and sends the job's standard output (or standard error) to the terminal. | -| `-q <name>` | non-interactive: `short` interactive(n`1): =interactive` interactive(n>1): `interactive_par` | Specifies the name of a job class defined locally in your cluster. 
You can use the llclass command to find out information on job classes. | -| `-x` | not specified | Puts the node running your job into exclusive execution mode. In exclusive execution mode, your job runs by itself on a node. It is dispatched only to a node with no other jobs running, and LoadLeveler does not send any other jobs to the node until the job completes. | -| `-hosts <number>` | automatically | Specifies the number of nodes requested by a job step. This option is equal to the bsub option -R "span\[hosts=number\]". | -| `-ptile <number>` | automatically | Specifies the number of nodes requested by a job step. This option is equal to the bsub option -R "span\[ptile=number\]". | -| `-mem <size>` | not specified | Specifies the requirement of memory which the job needs on a single node. The memory requirement is specified in MB. This option is equal to the bsub option -R "rusage\[mem=size\]". | +| Option | Default | Description | +|:----------------------|:-------------|:------------| +| `-J <name>` | `llsub` | Specifies the name of the job. You can name the job using any combination of letters, numbers, or both. The job name only appears in the long reports of the `llq`, `llstatus`, and `llsummary` commands. | +| `-n` | `1` | Specifies the total number of tasks of a parallel job you want to run on all available nodes. | +| `-T` | not specified | Specifies the maximum number of OpenMP threads to use per process by setting the environment variable `OMP_NUM_THREADS` to number. | +| `--o, -oo <filename>` | `<jobname>.<hostname>.<jobid>.out` | Specifies the name of the file to use as standard output (stdout) when your job step runs. | +| `-e, -oe <filename>` | `<jobname>.<hostname>.<jobid>.err` | Specifies the name of the file to use as standard error (stderr) when your job step runs. | +| `-I` | not specified | Submits an interactive job and sends the job's standard output (or standard error) to the terminal. | +| `-q <name>` | non-interactive: `short` interactive(n`1): =interactive` interactive(n>1): `interactive_par` | Specifies the name of a job class defined locally in your cluster. You can use the `llclass` command to find out information on job classes. | +| `-x` | not specified | Puts the node running your job into exclusive execution mode. In exclusive execution mode, your job runs by itself on a node. It is dispatched only to a node with no other jobs running, and LoadLeveler does not send any other jobs to the node until the job completes. | +| `-hosts <number>` | automatically | Specifies the number of nodes requested by a job step. This option is equal to the bsub option `-R "span\[hosts=number\]"`. | +| `-ptile <number>` | automatically | Specifies the number of nodes requested by a job step. This option is equal to the bsub option `-R "span\[ptile=number\]"`. | +| `-mem <size>` | not specified | Specifies the requirement of memory which the job needs on a single node. The memory requirement is specified in MB. This option is equal to the bsub option `-R "rusage\[mem=size\]"`. | The option `-H` prints the list of all available command line options. 
Here is an example for an MPI Job: - llsub -T 1 -n 16 -e err.txt -o out.txt mpirun -x LD_LIBRARY_PATH -np 16 ./my_program +```console +llsub -T 1 -n 16 -e err.txt -o out.txt mpirun -x LD_LIBRARY_PATH -np 16 ./my_program +``` ### Interactive Jobs Interactive Jobs can be submitted by the command: -`llsub -I -q <interactive> <command>` -### Loadleveler Runtime Environment Variables +```console +llsub -I -q <interactive> <command> +``` + +### LoadLeveler Runtime Environment Variables -Loadleveler Runtime Variables give you some information within the job +LoadLeveler runtime variables give you some information within the job script, for example: ```Bash @@ -209,8 +219,8 @@ The `llclass` command provides information about each queue. Example output: ```Bash -Name MaxJobCPU MaxProcCPU Free Max Description - d+hh:mm:ss d+hh:mm:ss Slots Slots +Name MaxJobCPU MaxProcCPU Free Max Description + d+hh:mm:ss d+hh:mm:ss Slots Slots --------------- -------------- -------------- ----- ----- --------------------- interactive undefined undefined 32 32 interactive, exclusive shared nodes, max. 12h runtime triton_ism undefined undefined 8 80 exclusive, serial + parallel queue, nodes shared, unlimited runtime @@ -226,13 +236,13 @@ short undefined undefined 272 384 serial + parallel queu ```Bash # llq -```Bash +``` #### All of One's Own Jobs ```Bash # llq -u username -```Bash +``` ### Details About Why A Job Has Not Yet Started @@ -262,14 +272,14 @@ Total number of available initiators of this class on all machines in the cluste Minimum number of initiators of this class required by job step: 32 The number of available initiators of this class is not sufficient for this job step. Not enough resources to start now. -This step is top-dog. +This step is top-dog. Considered at: Fri Jul 13 12:12:04 2007 Will start by: Tue Jul 17 18:10:32 2007 ``` ### Generate a long listing rather than the standard one -```Bash +```console # llq -l job-id ``` @@ -277,41 +287,41 @@ This command will give you detailed job information. ### Job Status States -| | | | -|------------------|-----|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Canceled | CA | The job has been canceled as by the llcancel command. | -| Completed | C | The job has completed. | -| Complete Pending | CP | The job is completed. Some tasks are finished. | +| | | | +|------------------|-----|----------------| +| Canceled | CA | The job has been canceled as by the `llcancel` command. | +| Completed | C | The job has completed. | +| Complete Pending | CP | The job is completed. Some tasks are finished. | | Deferred | D | The job will not be assigned until a specified date. The start date may have been specified by the user in the Job Command file or it may have been set by LoadLeveler because a parallel job could not obtain enough machines to run the job. | -| Idle | I | The job is being considered to run on a machine though no machine has been selected yet. | -| NotQueued | NQ | The job is not being considered to run. A job may enter this state due to an error in the command file or because LoadLeveler can not obtain information that it needs to act on the request. | -| Not Run | NR | The job will never run because a stated dependency in the Job Command file evaluated to be false. | -| Pending | P | The job is in the process of starting on one or more machines. 
The request to start the job has been sent but has not yet been acknowledged. | -| Rejected | X | The job did not start because there was a mismatch or requirements for your job and the resources on the target machine or because the user does not have a valid ID on the target machine. | -| Reject Pending | XP | The job is in the process of being rejected. | -| Removed | RM | The job was canceled by either LoadLeveler or the owner of the job. | -| Remove Pending | RP | The job is in the process of being removed. | -| Running | R | The job is running. | -| Starting | ST | The job is starting. | -| Submission Error | SX | The job can not start due to a submission error. Please notify the Bluedawg administration team if you encounter this error. | -| System Hold | S | The job has been put in hold by a system administrator. | -| System User Hold | HS | Both the user and a system administrator has put the job on hold. | -| Terminated | TX | The job was terminated, presumably by means beyond LoadLeveler's control. Please notify the Bluedawg administration team if you encounter this error. | -| User Hold | H | The job has been put on hold by the owner. | -| Vacated | V | The started job did not complete. The job will be scheduled again provided that the job may be rescheduled. | -| Vacate Pending | VP | The job is in the process of vacating. | +| Idle | I | The job is being considered to run on a machine though no machine has been selected yet. | +| NotQueued | NQ | The job is not being considered to run. A job may enter this state due to an error in the command file or because LoadLeveler can not obtain information that it needs to act on the request. | +| Not Run | NR | The job will never run because a stated dependency in the Job Command file evaluated to be false. | +| Pending | P | The job is in the process of starting on one or more machines. The request to start the job has been sent but has not yet been acknowledged. | +| Rejected | X | The job did not start because there was a mismatch or requirements for your job and the resources on the target machine or because the user does not have a valid ID on the target machine. | +| Reject Pending | XP | The job is in the process of being rejected. | +| Removed | RM | The job was canceled by either LoadLeveler or the owner of the job. | +| Remove Pending | RP | The job is in the process of being removed. | +| Running | R | The job is running. | +| Starting | ST | The job is starting. | +| Submission Error | SX | The job can not start due to a submission error. Please notify the Bluedawg administration team if you encounter this error. | +| System Hold | S | The job has been put in hold by a system administrator. | +| System User Hold | HS | Both the user and a system administrator has put the job on hold. | +| Terminated | TX | The job was terminated, presumably by means beyond LoadLeveler's control. Please notify the Bluedawg administration team if you encounter this error. | +| User Hold | H | The job has been put on hold by the owner. | +| Vacated | V | The started job did not complete. The job will be scheduled again provided that the job may be rescheduled. | +| Vacate Pending | VP | The job is in the process of vacating. | ## Cancel a Job ### A Particular Job -```Bash +```console # llcancel job-id ``` ### All of One's Jobs -```Bash +```console # llcancel -u username ``` @@ -319,18 +329,18 @@ This command will give you detailed job information. On each cluster, there exists a file that contains the history of all jobs run under LoadLeveler. 
This file is -**/var/loadl/archive/history.archive**, and may be queried using the -**llsummary** command. +`/var/loadl/archive/history.archive`, and may be queried using the +`llsummary` command. An example of usage would be as follows: -```Bash +```console # llsummary -u estrabd /var/loadl/archive/history.archive ``` And the output would look something like: -```Bash +```console Name Jobs Steps Job Cpu Starter Cpu Leverage estrabd 118 128 07:55:57 00:00:45 634.6 TOTAL 118 128 07:55:57 00:00:45 634.6 @@ -346,83 +356,85 @@ interactive 105 105 04:46:24 00:00:26 660.9 TOTAL 118 128 07:55:57 00:00:45 634.6 ``` -The **llsummary** tool has a lot of options, which are discussed in its +The `llsummary` tool has a lot of options, which are discussed in its man pages. ## Check status of each node - # llstatus +```console +# llstatus +``` And the output would look something like: -```Bash +```console root@triton[0]:~# llstatus -Name Schedd InQ Act Startd Run LdAvg Idle Arch OpSys -n01 Avail 0 0 Idle 0 0.00 2403 AMD64 Linux2 -n02 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n03 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n04 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n05 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n06 Avail 0 0 Idle 0 0.71 9999 AMD64 Linux2 -n07 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n08 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n09 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n10 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n11 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n12 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n13 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n14 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n15 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n16 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n17 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n18 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n19 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n20 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n21 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n22 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n23 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n24 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n25 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n26 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n27 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n28 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n29 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n30 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n31 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n32 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n33 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n34 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n35 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n36 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n37 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n38 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n39 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n40 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n41 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n42 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n43 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n44 Avail 0 0 Idle 0 0.01 9999 AMD64 Linux2 -n45 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n46 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n47 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n48 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n49 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n50 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n51 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n52 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n53 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n54 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n55 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n56 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n57 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n58 
Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n59 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n60 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n61 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n62 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n63 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -n64 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 -triton Avail 0 0 Idle 0 0.00 585 AMD64 Linux2 +Name Schedd InQ Act Startd Run LdAvg Idle Arch OpSys +n01 Avail 0 0 Idle 0 0.00 2403 AMD64 Linux2 +n02 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n03 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n04 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n05 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n06 Avail 0 0 Idle 0 0.71 9999 AMD64 Linux2 +n07 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n08 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n09 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n10 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n11 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n12 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n13 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n14 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n15 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n16 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n17 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n18 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n19 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n20 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n21 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n22 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n23 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n24 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n25 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n26 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n27 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n28 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n29 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n30 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n31 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n32 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n33 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n34 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n35 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n36 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n37 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n38 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n39 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n40 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n41 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n42 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n43 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n44 Avail 0 0 Idle 0 0.01 9999 AMD64 Linux2 +n45 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n46 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n47 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n48 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n49 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n50 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n51 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n52 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n53 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n54 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n55 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n56 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n57 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n58 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n59 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n60 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n61 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n62 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n63 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +n64 Avail 0 0 Idle 0 0.00 9999 AMD64 Linux2 +triton Avail 0 0 Idle 0 0.00 585 AMD64 Linux2 AMD64/Linux2 65 machines 0 jobs 0 running tasks Total Machines 65 machines 0 jobs 0 running tasks @@ -436,7 +448,7 @@ All machines on the machine_list are present. 
Detailed status information for a specific node: -```Bash +```console # llstatus -l n54 ``` diff --git a/Compendium_attachments/HardwareAtlas/Atlas_Knoten.pdf b/doc.zih.tu-dresden.de/docs/archive/misc/Atlas_Knoten.pdf similarity index 100% rename from Compendium_attachments/HardwareAtlas/Atlas_Knoten.pdf rename to doc.zih.tu-dresden.de/docs/archive/misc/Atlas_Knoten.pdf diff --git a/Compendium_attachments/HardwareAltix/altix_brick_web.png b/doc.zih.tu-dresden.de/docs/archive/misc/altix_brick_web.png similarity index 100% rename from Compendium_attachments/HardwareAltix/altix_brick_web.png rename to doc.zih.tu-dresden.de/docs/archive/misc/altix_brick_web.png diff --git a/Compendium_attachments/HardwareAltix/memory_access_web.png b/doc.zih.tu-dresden.de/docs/archive/misc/memory_access_web.png similarity index 100% rename from Compendium_attachments/HardwareAltix/memory_access_web.png rename to doc.zih.tu-dresden.de/docs/archive/misc/memory_access_web.png diff --git a/doc.zih.tu-dresden.de/docs/archive/platform_lsf.md b/doc.zih.tu-dresden.de/docs/archive/platform_lsf.md index 1be15a0a9beb204c188b339e00c487e6ebbd5af0..e0c0b95764618c0585e4eaf07fcac3e1f3bb947e 100644 --- a/doc.zih.tu-dresden.de/docs/archive/platform_lsf.md +++ b/doc.zih.tu-dresden.de/docs/archive/platform_lsf.md @@ -1,6 +1,8 @@ # Platform LSF -**This Page is deprecated!** The current bachsystem on Taurus is [Slurm][../jobs_and_resources/slurm.md] +!!! warning + This Page is deprecated! + The current bachsystem on ZIH systems is [Slurm](../jobs_and_resources/slurm.md). The HRSK-I systems are operated with the batch system LSF running on *Mars*, *Atlas* resp.. @@ -11,26 +13,26 @@ The job submission can be done with the command: Some options of `bsub` are shown in the following table: -| bsub option | Description | -|:-------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `-n \<N> ` | set number of processors (cores) to N(default=1) | -| `-W \<hh:mm> ` | set maximum wall clock time to `<hh:mm>` | -| `-J \<name> ` | assigns the specified name to the job | -| `-eo \<errfile> ` | writes the standard error output of the job to the specified file (overwriting) | -| `-o \<outfile> ` | appends the standard output of the job to the specified file | -| `-R span\[hosts=1\]` | use only one SMP node (automatically set by the batch system) | -| `-R span\[ptile=2\]` | run 2 tasks per node | -| `-x ` | disable other jobs to share the node ( Atlas ). | -| `-m ` | specify hosts to run on ( [see below](#HostList)) | -| `-M \<M> ` | specify per-process (per-core) memory limit (in MB), the job's memory limit is derived from that number (N proc \* M MB); see examples and [Attn. #2](#AttentionNo2) below | -| `-P \<project> ` | specifiy project | +| Bsub Option | Description | +|:-------------------|:------------| +| `-n <N>` | set number of processors (cores) to N(default=1) | +| `-W <hh:mm>` | set maximum wall clock time to `<hh:mm>` | +| `-J <name>` | assigns the specified name to the job | +| `-eo <errfile>` | writes the standard error output of the job to the specified file (overwriting) | +| `-o <outfile>` | appends the standard output of the job to the specified file | +| `-R span[hosts=1]` | use only one SMP node (automatically set by the batch system) | +| `-R span[ptile=2]` | run 2 tasks per node | +| `-x` | disable other jobs to share the node ( Atlas ). 
| +| `-m` | specify hosts to run on ( [see below](#host-list)) | +| `-M <M>` | specify per-process (per-core) memory limit (in MB), the job's memory limit is derived from that number (N proc * M MB); see examples below | +| `-P <project>` | specify project | You can use the `%J` -macro to merge the job ID into names. It might be more convenient to put the options directly in a job file which you can submit using -```Bash +```console bsub <my_jobfile> ``` @@ -44,7 +46,7 @@ The following example job file shows how you can make use of it: #BSUB -n 4 # number of processors #BSUB -M 500 # 500MB per core memory limit #BSUB -o out.%J # output file -#BSUB -u name@tu-dresden.de # email address; works ONLY with @tu-dresden.de +#BSUB -u name@tu-dresden.de # email address; works ONLY with @tu-dresden.de echo Starting Program cd $HOME/work @@ -52,21 +54,21 @@ a.out # e.g. an OpenMP program echo Finished Program ``` -**Understanding memory limits** The option -M to bsub defines how much +**Understanding memory limits** The option `-M` to bsub defines how much memory may be consumed by a single process of the job. The job memory limit is computed taking this value times the number of processes -requested (-n). Therefore, having -M 600 and -n 4 results in a job +requested (`-n`). Therefore, having `-M 600` and `-n 4` results in a job memory limit of 2400 MB. If any one of your processes consumes more than 600 MB memory OR if all processes belonging to this job consume more than 2400 MB of memory in sum, then the job will be killed by LSF. -- For serial programs, the given limit is the same for the process and - the whole job, e.g. 500 MB `bsub -W 1:00 -n 1 -M 500 myprog` -- For MPI-parallel programs, the job memory limit is N processes \* - memory limit, e.g. 32\*800 MB = 25600 MB `bsub -W 8:00 -n 32 -M 800 mympiprog` -- For OpenMP-parallel programs, the same applies as with MPI-parallel - programs, e.g. 8\*2000 MB = 16000 MB - `bsub -W 4:00 -n 8 -M 2000 myompprog` +- For serial programs, the given limit is the same for the process and + the whole job, e.g. 500 MB `bsub -W 1:00 -n 1 -M 500 myprog` +- For MPI-parallel programs, the job memory limit is N processes \* + memory limit, e.g. 32*800 MB = 25600 MB `bsub -W 8:00 -n 32 -M 800 mympiprog` +- For OpenMP-parallel programs, the same applies as with MPI-parallel + programs, e.g. 8*2000 MB = 16000 MB + `bsub -W 4:00 -n 8 -M 2000 myompprog` LSF sets the user environment according to the environment at the time of submission. @@ -78,14 +80,13 @@ of a job placement in a queue is therefore the ratio between used and granted CP period. **Attention:** If you do not give the maximum runtime of your program, the -default runtime for the specified queue is taken. This is way below the -maximal possible runtime (see table [below](#JobQueues)). +default runtime for the specified queue is taken. **Attention 2:** Some systems enforce a limit on how much memory each process and your job as a -whole may allocate. If your job or any of its processes exceed this limit (N proc.\*limit for the +whole may allocate. If your job or any of its processes exceed this limit (N proc.*limit for the job), your job will be killed. If memory limiting is in place, there also exists a default limit which will be applied to your job if you do not specify one. Please find the limits along with the -description of the machines' [queues](#JobQueues) below. +description of the machines' [queues](#job-queues) below. ### Interactive Jobs @@ -98,52 +99,45 @@ extensive production runs! 
Use the bsub options `-Is` for an interactive and, additionally on *Atlas*, `-XF` for an X11 job like: -```Bash +```console bsub -Is -XF matlab ``` or for an interactive job with a bash use -```Bash -bsub -Is -n 2 -W <hh:mm> -P <project> bash +```console +bsub -Is -n 2 -W <hh:mm> -P <project> bash ``` You can check the current usage of the system with the command `bhosts` to estimate the time to schedule. -## ParallelJobs - ### Parallel Jobs -For submitting parallel jobs, a few rules have to be understood and -followed. In general they depend on the type of parallelization and the -architecture. +For submitting parallel jobs, a few rules have to be understood and followed. In general they depend +on the type of parallelization and the architecture. #### OpenMP Jobs -An SMP-parallel job can only run within a node (or a partition), so it -is necessary to include the option `-R "span[hosts=1]"` . The maximum -number of processors for an SMP-parallel program is 506 on a large Altix -partition, and 64 on *Atlas*. A simple example of a job file -for an OpenMP job can be found above (section [3.4](#LSF-OpenMP)). +An SMP-parallel job can only run within a node (or a partition), so it is necessary to include the +option `-R span[hosts=1]` . The maximum number of processors for an SMP-parallel program is 506 on a +large Altix partition, and 64 on *Atlas*. #### MPI Jobs -There are major differences for submitting MPI-parallel jobs on the -systems at ZIH. Please refer to the HPC systems's section. It is -essential to use the same modules at compile- and run-time. +There are major differences for submitting MPI-parallel jobs on the systems at ZIH. Please refer to +the HPC systems's section. It is essential to use the same modules at compile- and run-time. ### Array Jobs -Array jobs can be used to create a sequence of jobs that share the same -executable and resource requirements, but have different input files, to -be submitted, controlled, and monitored as a single unit. +Array jobs can be used to create a sequence of jobs that share the same executable and resource +requirements, but have different input files, to be submitted, controlled, and monitored as a single +unit. -After the job array is submitted, LSF independently schedules and -dispatches the individual jobs. Each job submitted from a job array -shares the same job ID as the job array and are uniquely referenced -using an array index. The dimension and structure of a job array is -defined when the job array is created. +After the job array is submitted, LSF independently schedules and dispatches the individual jobs. +Each job submitted from a job array shares the same job ID as the job array and are uniquely +referenced using an array index. The dimension and structure of a job array is defined when the job +array is created. 
Here is an example how an array job can looks like: @@ -151,20 +145,19 @@ Here is an example how an array job can looks like: #!/bin/bash #BSUB -W 00:10 -#BSUB -n 1 +#BSUB -n 1 #BSUB -J "myTask[1-100:2]" # create job array with 50 tasks #BSUB -o logs/out.%J.%I # appends the standard output of the job to the specified file that # contains the job information (%J) and the task information (%I) -#BSUB -e logs/err.%J.%I # appends the error output of the job to the specified file that +#BSUB -e logs/err.%J.%I # appends the error output of the job to the specified file that # contains the job information (%J) and the task information (%I) echo "Hello Job $LSB_JOBID Task $LSB_JOBINDEX" ``` -Alternatively, you can use the following single command line to submit -an array job: +Alternatively, you can use the following single command line to submit an array job: -```Bash +```console bsub -n 1 -W 00:10 -J "myTask[1-100:2]" -o "logs/out.%J.%I" -e "logs/err.%J.%I" "echo Hello Job \$LSB_JOBID Task \$LSB_JOBINDEX" ``` @@ -172,15 +165,13 @@ For further details please read the LSF manual. ### Chain Jobs -You can use chain jobs to create dependencies between jobs. This is -often the case if a job relies on the result of one or more preceding -jobs. Chain jobs can also be used if the runtime limit of the batch -queues is not sufficient for your job. +You can use chain jobs to create dependencies between jobs. This is often the case if a job relies +on the result of one or more preceding jobs. Chain jobs can also be used if the runtime limit of the +batch queues is not sufficient for your job. -To create dependencies between jobs you have to use the option `-w`. -Since `-w` relies on the job id or the job name it is advisable to use -the option `-J` to create a user specified name for a single job. For -detailed information see the man pages of bsub with `man bsub`. +To create dependencies between jobs you have to use the option `-w`. Since `-w` relies on the job +id or the job name it is advisable to use the option `-J` to create a user specified name for a +single job. For detailed information see the man pages of bsub with `man bsub`. Here is an example how a chain job can looks like: @@ -217,52 +208,47 @@ done ## Job Queues -With the command `bqueues [-l <queue name>]` you can get information -about available queues. With `bqueues -l` you get a detailed listing of -the queue properties. +With the command `bqueues [-l <queue name>]` you can get information about available queues. With +`bqueues -l` you get a detailed listing of the queue properties. -`Attention`: The queue `interactive` is the only one to accept -interactive jobs! +**Attention:** The queue `interactive` is the only one to accept interactive jobs! ## Job Monitoring -You can check the current usage of the system with the command `bhosts` -to estimate the time to schedule. Or to get an overview on *Atlas*, -lsfview shows the current usage of the system. +You can check the current usage of the system with the command `bhosts` to estimate the time to +schedule. Or to get an overview on *Atlas*, lsfview shows the current usage of the system. The command `bhosts` shows the load on the hosts. 
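As a small illustration of the two monitoring commands described above, a quick pre-submission check might look like this (the queue name `interactive` is the one mentioned above; any queue name can be used):

```console
bqueues -l interactive
bhosts
```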
-For a more convenient overview the command `lsfshowjobs` displays -information on the LSF status like this: +For a more convenient overview the command `lsfshowjobs` displays information on the LSF status like +this: -```Bash +```console You have 1 running job using 64 cores You have 1 pending job ``` -and the command `lsfnodestat` displays the node and core status of -machine like this: +and the command `lsfnodestat` displays the node and core status of machine like this: -```Bash +```console # ------------------------------------------- nodes available: 714/714 nodes damaged: 0 # ------------------------------------------- -jobs running: 1797 \| cores closed (exclusive jobs): 94 jobs wait: 3361 -\| cores closed by ADMIN: 129 jobs suspend: 0 \| cores working: 2068 -jobs damaged: 0 \| +jobs running: 1797 | cores closed (exclusive jobs): 94 jobs wait: 3361 +| cores closed by ADMIN: 129 jobs suspend: 0 | cores working: 2068 +jobs damaged: 0 | # ------------------------------------------- normal working cores: 2556 cores free for jobs: 265 ``` -The command `bjobs` allows to monitor your running jobs. It has the -following options: +The command `bjobs` allows to monitor your running jobs. It has the following options: -| bjobs option | Description | +| Bjobs Option | Description | |:--------------|:----------------------------------------------------------------------------------------------------------------------------------| | `-r` | Displays running jobs. | | `-s` | Displays suspended jobs, together with the suspending reason that caused each job to become suspended. | @@ -270,26 +256,24 @@ following options: | `-a` | Displays information on jobs in all states, including finished jobs that finished recently. | | `-l [job_id]` | Displays detailed information for each job or for a particular job. | -## Checking the progress of your jobs +## Checking the Progress of Your Jobs -If you run code that regularily emits status or progress messages, using -the command +If you run code that regularly emits status or progress messages, using the command -```Bash +```console watch -n10 tail -n2 '*out' ``` -in your `$HOME/.lsbatch` directory is a very handy way to keep yourself -informed. Note that this only works if you did not use the `-o` option -of `bsub`, If you used `-o`, replace `*out` with the list of file names -you passed to this very option. +in your `$HOME/.lsbatch` directory is a very handy way to keep yourself informed. Note that this +only works if you did not use the `-o` option of `bsub`, If you used `-o`, replace `*out` with the +list of file names you passed to this very option. ## Host List -The `bsub` option `-m` can be used to specify a list of hosts for -execution. This is especially useful for memory intensive computations. +The `bsub` option `-m` can be used to specify a list of hosts for execution. This is especially +useful for memory intensive computations. ### Altix -Jupiter, saturn, and uranus have 4 GB RAM per core, mars only 1GB. So it -makes sense to specify '-m "jupiter saturn uranus". +Jupiter, Saturn, and Uranus have 4 GB RAM per core, mars only 1 GB. So it makes sense to specify +`-m "jupiter saturn uranus"`. 
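To make the host selection concrete, a memory-intensive job could be restricted to the 4 GB-per-core hosts roughly as follows; the wall time, core count, per-process memory limit, and program name are purely illustrative:

```console
bsub -W 4:00 -n 8 -M 3000 -m "jupiter saturn uranus" ./my_memory_intensive_app
```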
diff --git a/doc.zih.tu-dresden.de/docs/archive/ram_disk_documentation.md b/doc.zih.tu-dresden.de/docs/archive/ram_disk_documentation.md deleted file mode 100644 index 2f0a6071dc7aa1ecbb3e9b48563f70d39b773d7c..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/archive/ram_disk_documentation.md +++ /dev/null @@ -1,69 +0,0 @@ -# Ramdisk - -## Using parts of the main memory as a temporary file system - -On systems with a very large main memory, it is for some workloads very -attractive to use parts of the main memory as a temporary file system. -This will reduce file access times dramatically and has proven to speed -up applications that are otherwise limited by I/O. - -We provide tools to allow users to create and destroy their own -ramdisks. Currently, this is only allowed on the SGI UV2 (venus). Please -note that the content of the ramdisk will vanish immediatelly when the -ramdisk is destroyed or the machine crashes. Always copy out result data -written to the ramdisk to another location. - -### Creating a ramdisk - -On venus, the creation of ramdisks is only allowed from within an LSF -job. The memory used for the ramdisk will be deducted from the memory -assigned to the LSF job. Thus, the amount of memory available for an LSF -job determines the maximum size of the ramdisk. Per LSF job only a -single ramdisk can be created (but you can create and delete a ramdisk -multiple times during a job). You need to load the corresponding -software module via - -```Bash -module load ramdisk -``` - -Afterwards, the ramdisk can be created with the command - -```Bash -make-ramdisk «size of the ramdisk in GB» -``` - -The path to the ramdisk is fixed to `/ramdisks/«JOBID»`. - -### Putting data onto the ramdisk - -The ramdisk itself works like a normal file system or directory. We -provide a script that uses multiple threads to copy a directory tree. It -can also be used to transfer single files but will only use one thread -in this case. It is used as follows - -```Bash -parallel-copy.sh «source directory or file» «target directory» -``` - -It is not specifically tailored to be used with the ramdisk. It can be -used for any copy process between two locations. - -### Destruction of the ramdisk - -A ramdisk will automatically be deleted at the end of the job. As an -alternative, you can delete your own ramdisk via the command - -```Bash -kill-ramdisk -``` - -It is possible, that the deletion of the ramdisk fails. The reason for -this is typically that some process still has a file open within the -ramdisk or that there is still a program using the ramdisk or having the -ramdisk as its current path. Locating these processes, that block the -destruction of the ramdisk is possible via using the command - -```Bash -lsof +d /ramdisks/«JOBID» -``` diff --git a/doc.zih.tu-dresden.de/docs/archive/system_altix.md b/doc.zih.tu-dresden.de/docs/archive/system_altix.md index d3ebdbbe554d5aa3f7dcda460d4831974a589744..951b06137a599fc95239e5d50144fd2fa205e096 100644 --- a/doc.zih.tu-dresden.de/docs/archive/system_altix.md +++ b/doc.zih.tu-dresden.de/docs/archive/system_altix.md @@ -1,12 +1,98 @@ # SGI Altix -**This page is deprecated! The SGI Atlix is a former system!** +!!! warning -The SGI Altix is shared memory system for large parallel jobs using up to 2000 cores in parallel ( -[information on the hardware](hardware_altix.md)). It's partitions are Mars (login), Jupiter, Saturn, -Uranus, and Neptun (interactive). + **This page is deprecated! 
The SGI Altix is a former system!**
-## Compiling Parallel Applications
+## System
+
+The SGI Altix 4700 is a shared memory system with dual core Intel Itanium 2 CPUs (Montecito)
+operated by the Linux operating system SUSE SLES 10 with a 2.6 kernel. Currently, the following
+Altix partitions are installed at ZIH:
+
+|Name|Total Cores|Compute Cores|Memory per Core|
+|:----|:----|:----|:----|
+| Mars |384 |348 |1 GB|
+|Jupiter |512 |506 |4 GB|
+|Saturn |512 |506 |4 GB|
+|Uranus |512 |506 |4 GB|
+|Neptun |128 |128 |1 GB|
+
+The jobs for these partitions (except Neptun) are scheduled by the [Platform LSF](platform_lsf.md)
+batch system running on `mars.hrsk.tu-dresden.de`. The actual placement of a submitted job may
+depend on factors like memory size, number of processors, and time limit.
+
+### File Systems
+
+All partitions share the same CXFS file systems `/work` and `/fastfs`.
+
+### ccNUMA Architecture
+
+The SGI Altix has a ccNUMA architecture, which stands for *Cache Coherent Non-Uniform Memory Access*.
+It can be considered as an SM-MIMD (*shared memory - multiple instruction multiple data*) machine.
+The SGI ccNUMA system has the following properties:
+
+- Memory is physically distributed but logically shared
+- Memory is kept coherent automatically by hardware.
+- Coherent memory: memory is always valid (caches hold copies)
+- Granularity is L3 cache line (128 B)
+- Bandwidth of NUMAlink4 is 6.4 GB/s
+
+The ccNUMA is a compromise between a distributed memory system and a flat symmetric multiprocessing
+machine (SMP). Although the memory is shared, the access properties are not the same.
+
+### Compute Module
+
+The basic compute module of an Altix system is shown below.
+
+
+{: align="center"}
+
+It consists of one dual core Intel Itanium 2 "Montecito" processor, the
+local memory of 4 GB (2 GB on `Mars`), and the communication component,
+the so-called SHUB. All resources are shared by both cores. They have a
+common front side bus, so that accumulated memory bandwidth for both is
+not higher than for just one core.
+
+The SHUB connects local and remote resources. Via the SHUB and NUMAlink
+all CPUs can access remote memory in the whole system. Naturally, local
+memory provides the fastest access. There are some hints and commands
+that may help you to get optimal memory allocation and process placement.
+Four of these compute modules (blades) are grouped together with a NUMA
+router in a compute brick. All bricks are connected with NUMAlink4 in a
+"fat-tree"-topology.
+
+Remote memory access via SHUBs and NUMAlink
+
+{: align="center"}
+
+### CPU
+
+The current SGI Altix is based on the dual core Intel Itanium 2
+processor (code name "Montecito"). One core has the following basic
+properties:
+
+| Property | Value |
+|-------------------------------------|----------------------------|
+| clock rate | 1.6 GHz |
+| integer units | 6 |
+| floating point units (multiply-add) | 2 |
+| peak performance | 6.4 GFLOPS |
+| L1 cache | 2 x 16 kB, 1 clock latency |
+| L2 cache | 256 kB, 5 clock latency |
+| L3 cache | 9 MB, 12 clock latency |
+| front side bus | 128 bit x 200 MHz |
+
+The theoretical peak performance of all Altix partitions is hence about 13.1 TFLOPS
+(2048 cores in total, each with a peak of 6.4 GFLOPS).
+
+The processor has hardware support for efficient software pipelining. For many scientific
+applications it provides a high sustained performance exceeding the performance of RISC CPUs with
+similar peak performance. On the downside, the compiler has to explicitly discover
+and exploit the parallelism in the application.
+ +## Usage + +### Compiling Parallel Applications This installation of the Message Passing Interface supports the MPI 1.2 standard with a few MPI-2 features (see `man mpi` ). There is no command like `mpicc`, instead you just have to use the normal @@ -16,32 +102,31 @@ additional library- or include-paths. - Note for C++ programmers: You need to link with `-lmpi++abi1002 -lmpi` instead of `-lmpi`. - Note for Fortran programmers: The MPI module is only provided for the Intel compiler and does not - work with gfortran. + work with `gfortran`. Please follow these following guidelines to run your parallel program using the batch system on Mars. -## Batch system +### Batch System -Applications on an HPC system can not be run on the login node. They -have to be submitted to compute nodes with dedicated resources for the -user's job. Normally a job can be submitted with these data: +Applications on an HPC system can not be run on the login node. They have to be submitted to compute +nodes with dedicated resources for the user's job. Normally a job can be submitted with these data: -- number of CPU cores, -- requested CPU cores have to belong on one node (OpenMP programs) or - can distributed (MPI), -- memory per process, -- maximum wall clock time (after reaching this limit the process is - killed automatically), -- files for redirection of output and error messages, -- executable and command line parameters. +- number of CPU cores, +- requested CPU cores have to belong on one node (OpenMP programs) or + can distributed (MPI), +- memory per process, +- maximum wall clock time (after reaching this limit the process is + killed automatically), +- files for redirection of output and error messages, +- executable and command line parameters. -### LSF +#### LSF -The batch sytem on Atlas is LSF. For general information on LSF, please follow +The batch system on Atlas is LSF. For general information on LSF, please follow [this link](platform_lsf.md). -### Submission of Parallel Jobs +#### Submission of Parallel Jobs The MPI library running on the Altix is provided by SGI and highly optimized for the ccNUMA architecture of this machine. However, communication within a partition is faster than across @@ -49,18 +134,18 @@ partitions. Take this into consideration when you submit your job. Single-partition jobs can be started like this: -```Bash +```console bsub -R "span[hosts=1]" -n 16 mpirun -np 16 a.out< ``` -Really large jobs with over 256 CPUs might run over multiple partitions. -Cross-partition jobs can be submitted via PAM like this +Really large jobs with over 256 CPUs might run over multiple partitions. Cross-partition jobs can +be submitted via PAM like this -```Bash +```console bsub -n 1024 pamrun a.out ``` -### Batch Queues +#### Batch Queues | Batch Queue | Admitted Users | Available CPUs | Default Runtime | Max. Runtime | |:---------------|:-----------------|:--------------------|:----------------|:-------------| diff --git a/doc.zih.tu-dresden.de/docs/archive/system_atlas.md b/doc.zih.tu-dresden.de/docs/archive/system_atlas.md index c31a9b5dc536cbd6c76e772b317739171c83ab11..0e744c4ab702afac9d3ac413ccfb5abd58fef817 100644 --- a/doc.zih.tu-dresden.de/docs/archive/system_atlas.md +++ b/doc.zih.tu-dresden.de/docs/archive/system_atlas.md @@ -1,11 +1,62 @@ -# Atlas +# MEGWARE PC-Farm Atlas -**This page is deprecated! Atlas is a former system!** +!!! warning -Atlas is a general purpose HPC cluster for jobs using 1 to 128 cores in parallel -([Information on the hardware](hardware_atlas.md)). 
+ **This page is deprecated! Atlas is a former system!** -## Compiling Parallel Applications +## System + +The PC farm `Atlas` is a heterogeneous, general purpose cluster based on multicore chips AMD Opteron +6274 ("Bulldozer"). The nodes are operated by the Linux operating system SUSE SLES 11 with a 2.6 +kernel. Currently, the following hardware is installed: + +| Component | Count | +|-----------|--------| +| CPUs |AMD Opteron 6274 | +| number of cores | 5120 | +|th. peak performance | 45 TFLOPS | +|compute nodes | 4-way nodes *Saxonid* with 64 cores | +|nodes with 64 GB RAM | 48 | +|nodes with 128 GB RAM | 12 | +|nodes with 512 GB RAM | 8 | + +Mars and Deimos users: Please read the [migration hints](migrate_to_atlas.md). + +All nodes share the `/home` and `/fastfs` file system with our other HPC systems. Each +node has 180 GB local disk space for scratch mounted on `/tmp`. The jobs for the compute nodes are +scheduled by the [Platform LSF](platform_lsf.md) batch system from the login nodes +`atlas.hrsk.tu-dresden.de` . + +A QDR Infiniband interconnect provides the communication and I/O infrastructure for low latency / +high throughput data traffic. + +Users with a login on the [SGI Altix](system_altix.md) can access their home directory via NFS +below the mount point `/hpc_work`. + +### CPU AMD Opteron 6274 + +| Component | Count | +|-----------|--------| +| Clock rate | 2.2 GHz | +| cores | 16 | +| L1 data cache | 16 KB per core | +| L1 instruction cache | 64 KB shared in a *module* (i.e. 2 cores) | +| L2 cache | 2 MB per module | +| L3 cache | 12 MB total, 6 MB shared between 4 modules = 8 cores | +| FP units | 1 per module (supports fused multiply-add) | +| th. peak performance | 8.8 GFLOPS per core (w/o turbo) | + +The CPU belongs to the x86_64 family. Since it is fully capable of +running x86-code, one should compare the performances of the 32 and 64 +bit versions of the same code. + +For more architectural details, see the +[AMD Bulldozer block diagram](http://upload.wikimedia.org/wikipedia/commons/e/ec/AMD_Bulldozer_block_diagram_%288_core_CPU%29.PNG) +and [topology of Atlas compute nodes](misc/Atlas_Knoten.pdf). + +## Usage + +### Compiling Parallel Applications When loading a compiler module on Atlas, the module for the MPI implementation OpenMPI is also loaded in most cases. If not, you should explicitly load the OpenMPI module with `module load @@ -16,9 +67,9 @@ use the currently loaded compiler. To reveal the command lines behind the wrappe `-show`. For running your code, you have to load the same compiler and MPI module as for compiling the -program. Please follow te following guiedlines to run your parallel program using the batch system. +program. Please follow the outlined guidelines to run your parallel program using the batch system. -## Batch System +### Batch System Applications on an HPC system can not be run on the login node. They have to be submitted to compute nodes with dedicated resources for the @@ -33,12 +84,12 @@ user's job. Normally a job can be submitted with these data: - files for redirection of output and error messages, - executable and command line parameters. -### LSF +#### LSF -The batch sytem on Atlas is LSF. For general information on LSF, please follow +The batch system on Atlas is LSF. For general information on LSF, please follow [this link](platform_lsf.md). -### Submission of Parallel Jobs +#### Submission of Parallel Jobs To run MPI jobs ensure that the same MPI module is loaded as during compile-time. 
In doubt, check you loaded modules with `module list`. If you code has been compiled with the standard OpenMPI @@ -47,11 +98,11 @@ installation, you can load the OpenMPI module via `module load openmpi`. Please pay attention to the messages you get loading the module. They are more up-to-date than this manual. To submit a job the user has to use a script or a command-line like this: -```Bash +```console bsub -n <N> mpirun <program name> ``` -### Memory Limits +#### Memory Limits **Memory limits are enforced.** This means that jobs which exceed their per-node memory limit **may be killed** automatically by the batch system. @@ -79,7 +130,7 @@ or less** may be scheduled to smaller memory nodes. Have a look at the **examples below**. -#### Monitoring memory usage +#### Monitoring Memory Usage At the end of the job completion mail there will be a link to a website which shows the memory usage over time per node. This will only be @@ -87,8 +138,8 @@ available for longer running jobs (>10 min). #### Examples -| Job Spec. | Nodes Allowed | Remark | -|:--------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------| +| Job Spec. | Nodes Allowed | Remark | +|:----------|:--------------|:-------| | `bsub -n 1 -M 500` | All nodes | <= 940 Fits everywhere | | `bsub -n 64 -M 700` | All nodes | <= 940 Fits everywhere | | `bsub -n 4 -M 1800` | All nodes | Is allowed to oversubscribe on small nodes n\[001-047\] | diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware_deimos.md b/doc.zih.tu-dresden.de/docs/archive/system_deimos.md similarity index 65% rename from doc.zih.tu-dresden.de/docs/archive/hardware_deimos.md rename to doc.zih.tu-dresden.de/docs/archive/system_deimos.md index a426381651f2807fb9c339e104ac4b2413aaec8f..a80890f070a92d5bcf6dc35205f72072e9ddd89a 100644 --- a/doc.zih.tu-dresden.de/docs/archive/hardware_deimos.md +++ b/doc.zih.tu-dresden.de/docs/archive/system_deimos.md @@ -1,10 +1,15 @@ # Linux Networx PC-Farm Deimos -The PC farm `Deimos` is a heterogenous cluster based on dual core AMD -Opteron CPUs. The nodes are operated by the Linux operating system SuSE -SLES 10 with a 2.6 kernel. Currently, the following hardware is -installed: +!!! warning + **This page is deprecated! Deimos is a former system!** + +The PC farm `Deimos` is a heterogeneous cluster based on dual core AMD Opteron CPUs. The nodes are +operated by the Linux operating system SuSE SLES 10 with a 2.6 kernel. Currently, the following +hardware is installed: + +| Component | Count | +|-----------|-------| |CPUs |AMD Opteron X85 dual core | |RAM per core |2 GB | |Number of cores |2584 | @@ -15,7 +20,7 @@ installed: |quad nodes (32 GB RAM) |24 | All nodes share a 68 TB on DDN hardware. Each node has per core 40 GB local disk space for scratch -mounted on `/tmp` . The jobs for the compute nodes are scheduled by the +mounted on `/tmp`. The jobs for the compute nodes are scheduled by the [Platform LSF](platform_lsf.md) batch system from the login nodes `deimos.hrsk.tu-dresden.de` . @@ -23,14 +28,16 @@ Two separate Infiniband networks (10 Gb/s) with low cascading switches provide t I/O infrastructure for low latency / high throughput data traffic. An additional gigabit Ethernet network is used for control and service purposes. 
-Users with a login on the [SGI Altix](hardware_altix.md) can access their home directory via NFS +Users with a login on the [SGI Altix](system_altix.md) can access their home directory via NFS below the mount point `/hpc_work`. ## CPU -The cluster is based on dual-core AMD Opteron X85 processor. One core -has the following basic properties: +The cluster is based on dual-core AMD Opteron X85 processor. One core has the following basic +properties: +| Component | Count | +|-----------|-------| |clock rate |2.6 GHz | |floating point units |2 | |peak performance |5.2 GFLOPS | diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware_phobos.md b/doc.zih.tu-dresden.de/docs/archive/system_phobos.md similarity index 76% rename from doc.zih.tu-dresden.de/docs/archive/hardware_phobos.md rename to doc.zih.tu-dresden.de/docs/archive/system_phobos.md index 9f70d45161fac7363e9e0828af4b788d817fc1c9..bcd0d9cd88d758a643722669fcd50a6cbeaf99c5 100644 --- a/doc.zih.tu-dresden.de/docs/archive/hardware_phobos.md +++ b/doc.zih.tu-dresden.de/docs/archive/system_phobos.md @@ -1,12 +1,16 @@ # Linux Networx PC-Cluster Phobos -**Phobos was shut down on 1 November 2010.** +!!! warning + + **This page is deprecated! Phobos is a former system which was shut down on 1 November 2010.** `Phobos` is a cluster based on AMD Opteron CPUs. The nodes are operated by the Linux operating system SuSE SLES 9 with a 2.6 kernel. Currently, the following hardware is installed: -|CPUs \|AMD Opteron 248 (single core) | +| Component | Count | +|-----------|-------| +|CPUs |AMD Opteron 248 (single core) | |total peak performance |563.2 GFLOPS | |Number of nodes |64 compute + 1 master | |CPUs per node |2 | @@ -25,6 +29,8 @@ and service purposes. `Phobos` is based on single-core AMD Opteron 248 processor. It has the following basic properties: +| Component | Count | +|-----------|-------| |clock rate |2.2 GHz | |floating point units |2 | |peak performance |4.4 GFLOPS | @@ -32,6 +38,5 @@ following basic properties: |L2 cache |1 MB | |memory bus |128 bit x 200 MHz | -The CPU belongs to the x86_64 family. Although it is fully capable of -running x86-code, one should always try to use 64-bit programs due to -their potentially higher performance. +The CPU belongs to the x86_64 family. Although it is fully capable of running x86-code, one should +always try to use 64-bit programs due to their potentially higher performance. diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware_titan.md b/doc.zih.tu-dresden.de/docs/archive/system_titan.md similarity index 50% rename from doc.zih.tu-dresden.de/docs/archive/hardware_titan.md rename to doc.zih.tu-dresden.de/docs/archive/system_titan.md index 6c383c94feafa9628f234b00a0f28f31c9f4902d..d22c774c3a52e55794a398e5533962f62df231e4 100644 --- a/doc.zih.tu-dresden.de/docs/archive/hardware_titan.md +++ b/doc.zih.tu-dresden.de/docs/archive/system_titan.md @@ -1,4 +1,8 @@ -# Windows HPC Server 2008 - Cluster Titan +# Windows HPC Server 2008 Cluster Titan + +!!! warning + + **This page is deprecated! Titan is a former system!** The Dell Blade Server `Titan` is a homogenous cluster based on quad core Intel Xeon CPUs. The cluster consists of one management and 8 compute @@ -10,16 +14,24 @@ protocol. The nodes are operated by the Windows operating system Microsoft HPC Server 2008. 
Currently, the following hardware is installed: -\* Compute Node: \|CPUs \|Intel Xeon E5440 Quad-Core \| \|RAM per core -\|2 GB \| \|Number of cores \|64 \| \|total peak performance \|724,48 -GFLOPS \| +* Compute Node: + +| Component | Count | +|-----------|-------| +| CPUs | Intel Xeon E5440 Quad-Core | +| RAM per core |2 GB | +| Number of cores | 64 | +| total peak performance | 724,48 GFLOPS | -\* Management Node: +* Management Node: -\|CPUs \|Intel Xeon E5410 Quad-Core \| \|RAM per core \|2 GB \| \|Number -of cores \|8 \| +| Component | Count | +|-----------|-------| +| CPUs |Intel Xeon E5410 Quad-Core | +| RAM per core | 2 GB | +| Number of cores | 8 | -\<P> The management node shares 1.2 TB disk space via NTFS over all +The management node shares 1.2 TB disk space via NTFS over all nodes. Each node has a local disk of 120 GB. The jobs for the compute nodes are scheduled by the Microsoft scheduler, which is a part of the Microsoft HPC Pack, from the management node. The job submission can be @@ -33,19 +45,26 @@ and I/O infrastructure. The cluster is based on quad core Intel Xeon E5440 processor. One core has the following basic properties: -\|clock rate \|2.83 GHz \| \|floating point units \|2 \| \|peak -performance \|11.26 GFLOPS \| \|L1 cache \|32 KB I + 32KB on chip per -core \| \|L2 cache \|12 MB I+D on chip per chip, 6MB shared/ 2 cores \| -\|FSB \|1333 MHz \| +| Component | Count | +|-----------|-------| +|clock rate | 2.83 GHz | +|floating point units | 2 | +|peak performance | 11.26 GFLOPS | +|L1 cache |32 KB I + 32KB on chip per core | +|L2 cache |12 MB I+D on chip per chip, 6MB shared/ 2 cores | +|FSB |1333 MHz | The management node is based on a quad core Intel Xeon E5410 processor. One core has the following basic properties: -\|clock rate \|2.33 GHz \| \|floating point units \|2 \| \|peak -performance \|9.32 GFLOPS \| \|L1 cache \|32 KB I + 32KB on chip per -core \| \|L2 cache \|12 MB I+D on chip per chip, 6MB shared/ 2 cores \| -\|FSB \|1333 MHz \| +| Component | Count | +|-----------|-------| +|clock rate |2.33 GHz | +|floating point units |2 | +|peak performance |9.32 GFLOPS | +|L1 cache |32 KB I + 32KB on chip per core | +|L2 cache | 12 MB I+D on chip per chip, 6MB shared/ 2 cores | +|FSB |1333 MHz | -The CPU belongs to the x86_64 family. Since it is fully capable of -running x86-code, one should compare the performances of the 32 and 64 -bit versions of the same code. +The CPU belongs to the x86_64 family. Since it is fully capable of running x86-code, one should +compare the performances of the 32 and 64 bit versions of the same code. diff --git a/doc.zih.tu-dresden.de/docs/archive/hardware_triton.md b/doc.zih.tu-dresden.de/docs/archive/system_triton.md similarity index 82% rename from doc.zih.tu-dresden.de/docs/archive/hardware_triton.md rename to doc.zih.tu-dresden.de/docs/archive/system_triton.md index 646972202c2679849ce2d7c5ac866123b55e617e..ada59bfc14208f476752c023ed9fb4f2558e15e1 100644 --- a/doc.zih.tu-dresden.de/docs/archive/hardware_triton.md +++ b/doc.zih.tu-dresden.de/docs/archive/system_triton.md @@ -1,11 +1,15 @@ -# Hardware +# IBM-iDataPlex Cluster Trition -## IBM-iDataPlex +!!! warning -is a cluster based on quadcore Intel Xeon CPUs. The nodes are operated -by the Linux operating system SuSE SLES 11. Currently, the following + **This page is deprecated! Trition is a former system!** + +Trition is a cluster based on quadcore Intel Xeon CPUs. The nodes are operated +by the Linux operating system SUSE SLES 11. 
Currently, the following hardware is installed: +| Component | Count | +|-----------|-------| |CPUs |Intel quadcore E5530 | |RAM per core |6 GB | |Number of cores |512 | @@ -20,6 +24,9 @@ the login node triton.hrsk.tu-dresden.de . The cluster is based on dual-core Intel Xeon E5530 processor. One core has the following basic properties: +| Component | Count | +|-----------|-------| +|CPUs |Intel quadcore E5530 | |clock rate |2.4 GHz | |Cores |4 | |Threads |8 | diff --git a/doc.zih.tu-dresden.de/docs/archive/system_venus.md b/doc.zih.tu-dresden.de/docs/archive/system_venus.md index 5e9334d02d0cd68662c2d0744464798b04b0344d..2c0a1fe2b83b1c4e7d09f5e2f6495db8658cb7f9 100644 --- a/doc.zih.tu-dresden.de/docs/archive/system_venus.md +++ b/doc.zih.tu-dresden.de/docs/archive/system_venus.md @@ -1,11 +1,32 @@ -# Venus +# SGI UV2000 / Venus -## Information about the hardware +!!! warning -Detailed information on the currect HPC hardware can be found -[here](hardware_venus.md). + **This page is deprecated! The SGI UV2000 (Venus) is a former system!** -## Login to the System +## System + +The SGI UV2000 is a shared memory system based on Intel Sandy Bridge processors. It is operated by +the Linux operating system SLES 11 SP 3 with a kernel version 3.x. + +| Component | Count | +|----------------------------|-------| +| Number of CPU sockets | 64 | +| Physical cores per sockets | 8 | +| Total number of cores | 512 | +| Total memory | 8 TiB | + +From our experience, most parallel applications benefit from using the additional hardware +hyperthreads. + +### File Systems + +Venus uses the same `home` file system as all our other HPC installations. +For computations, please use `/scratch`. + +## Usage + +### Login to the System Login to the system is available via ssh at `venus.hrsk.tu-dresden.de`. @@ -21,7 +42,7 @@ and SHA256:Qq1OrgSCTzgziKoop3a/pyVcypxRfPcZT7oUQ3V7E0E ``` -## MPI +### MPI The installation of the Message Passing Interface on Venus (SGI MPT) supports the MPI 2.2 standard (see `man mpi` ). There is no command like `mpicc`, instead you just have to use the "serial" @@ -29,7 +50,7 @@ compiler (e.g. `icc`, `icpc`, or `ifort`) and append `-lmpi` to the linker comma Example: -```Bash +```console % icc -o myprog -g -O2 -xHost myprog.c -lmpi ``` @@ -38,11 +59,11 @@ Notes: - C++ programmers: You need to link with both libraries: `-lmpi++ -lmpi`. - Fortran programmers: The MPI module is only provided for the Intel - compiler and does not work with gfortran. + compiler and does not work with `gfortran`. Please follow the following guidelines to run your parallel program using the batch system on Venus. -## Batch system +### Batch System Applications on an HPC system can not be run on the login node. They have to be submitted to compute nodes with dedicated resources for the user's job. Normally a job can be submitted with these data: @@ -56,10 +77,10 @@ nodes with dedicated resources for the user's job. Normally a job can be submitt - files for redirection of output and error messages, - executable and command line parameters. -The batch sytem on Venus is Slurm. For general information on Slurm, please follow +The batch system on Venus is Slurm. For general information on Slurm, please follow [this link](../jobs_and_resources/slurm.md). -### Submission of Parallel Jobs +#### Submission of Parallel Jobs The MPI library running on the UV is provided by SGI and highly optimized for the ccNUMA architecture of this machine. 
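A job file covering the job parameters listed above (cores, memory per process, wall clock time, output files, executable) might look like the following sketch; all values, file names, and the program name are illustrative, not a site recommendation:

```bash
#!/bin/bash
#SBATCH --ntasks=16            # number of CPU cores (MPI processes)
#SBATCH --mem-per-cpu=2000M    # memory per process
#SBATCH --time=01:00:00        # maximum wall clock time
#SBATCH --output=out-%j.log    # redirect standard output
#SBATCH --error=err-%j.log     # redirect standard error

srun -n 16 ./a.out             # executable and command line parameters
```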
@@ -74,8 +95,7 @@ srun -n 16 a.out **Please note:** There are different MPI libraries on Taurus and Venus, so you have to compile the binaries specifically for their target. -### File Systems +#### File Systems -- The large main memory on the system allows users to create ramdisks - within their own jobs. The documentation on how to use these - ramdisks can be found [here](ram_disk_documentation.md). +- The large main memory on the system allows users to create RAM disks + within their own jobs. diff --git a/doc.zih.tu-dresden.de/docs/archive/systems_switched_off.md b/doc.zih.tu-dresden.de/docs/archive/systems_switched_off.md new file mode 100644 index 0000000000000000000000000000000000000000..c4f9890ac3ad36580c617b6fb5292cb0b1ceffcb --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/archive/systems_switched_off.md @@ -0,0 +1,12 @@ +# Hardware + +HPC at ZIH has a quite long history and several systems have been installed and operated. +Documentation on former systems for future reference can be found on the following pages: + +- [SGI Altix](system_altix.md) +- [PC-Farm Atlas](system_atlas.md) +- [PC-Farm Deimos](system_deimos.md) +- [PC-Cluster Phobos](system_phobos.md) +- [Windows-HPC-Server Titan](system_titan.md) +- [PC-Cluster Triton](system_triton.md) +- [Shared-Memory-System Venus](system_venus.md) diff --git a/doc.zih.tu-dresden.de/docs/data_lifecycle/overview.md b/doc.zih.tu-dresden.de/docs/data_lifecycle/overview.md index e1b5fca65e562a243590c8fb55f92242b2265b4a..4c09103dcb0a001a954523cc0382d37cd2b3d5a6 100644 --- a/doc.zih.tu-dresden.de/docs/data_lifecycle/overview.md +++ b/doc.zih.tu-dresden.de/docs/data_lifecycle/overview.md @@ -10,7 +10,7 @@ uniformity of the project can be achieved by taking into account and setting up The used set of software within an HPC project can be management with environments on different levels either defined by [modules](../software/modules.md), [containers](../software/containers.md) -or by [Python virtual environments](../software/python.md). +or by [Python virtual environments](../software/python_virtual_environments.md). In the following, a brief overview on relevant topics w.r.t. data life cycle management is provided. ## Data Storage and Management @@ -19,27 +19,27 @@ The main concept of working with data on ZIH systems bases on [Workspaces](works properly: * use a `/home` directory for the limited amount of personal data, simple examples and the results - of calculations. The home directory is not a working directory! However, `/home` file system is + of calculations. The home directory is not a working directory! However, `/home` filesystem is [backed up](#backup) using snapshots; - * use `workspaces` as a place for working data (i.e. datasets); Recommendations of choosing the + * use `workspaces` as a place for working data (i.e. data sets); Recommendations of choosing the correct storage system for workspace presented below. -### Taxonomy of File Systems +### Taxonomy of Filesystems It is important to design your data workflow according to characteristics, like I/O footprint (bandwidth/IOPS) of the application, size of the data, (number of files,) and duration of the -storage to efficiently use the provided storage and file systems. -The page [file systems](file_systems.md) holds a comprehensive documentation on the different file -systems. +storage to efficiently use the provided storage and filesystems. +The page [filesystems](file_systems.md) holds a comprehensive documentation on the different +filesystems. 
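As an illustration of the workspace concept mentioned above, a working directory on the `scratch` filesystem might be created and listed roughly like this (workspace name and duration are illustrative; the exact tooling is documented on the [Workspaces](workspaces.md) page, assuming the usual `ws_allocate`/`ws_list` commands are available):

```console
ws_allocate -F scratch my_dataset 30
ws_list
```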
<!--In general, the mechanisms of so-called--> <!--[Workspaces](workspaces.md) are compulsory for all HPC users to store data for a defined duration ---> <!--depending on the requirements and the storage system this time span might range from days to a few--> <!--years.--> -<!--- [HPC file systems](file_systems.md)--> +<!--- [HPC filesystems](file_systems.md)--> <!--- [Intermediate Archive](intermediate_archive.md)--> <!--- [Special data containers] **todo** Special data containers (was no valid link in old compendium)--> -<!--- [Move data between file systems](../data_transfer/data_mover.md)--> -<!--- [Move data to/from ZIH's file systems](../data_transfer/export_nodes.md)--> +<!--- [Move data between filesystems](../data_transfer/data_mover.md)--> +<!--- [Move data to/from ZIH's filesystems](../data_transfer/export_nodes.md)--> <!--- [Longterm Preservation for ResearchData](preservation_research_data.md)--> !!! hint "Recommendations to choose of storage system" @@ -48,7 +48,7 @@ range from days to a few--> <!--years.--> [warm_archive](file_systems.md#warm_archive) can be used. (Note that this is mounted **read-only** on the compute nodes). * For a series of calculations that works on the same data please use a `scratch` based [workspace](workspaces.md). - * **SSD**, in its turn, is the fastest available file system made only for large parallel + * **SSD**, in its turn, is the fastest available filesystem made only for large parallel applications running with millions of small I/O (input, output operations). * If the batch job needs a directory for temporary data then **SSD** is a good choice as well. The data can be deleted afterwards. @@ -60,17 +60,17 @@ otherwise it could vanish. The core data of your project should be [backed up](# ### Backup The backup is a crucial part of any project. Organize it at the beginning of the project. The -backup mechanism on ZIH systems covers **only** the `/home` and `/projects` file systems. Backed up +backup mechanism on ZIH systems covers **only** the `/home` and `/projects` filesystems. Backed up files can be restored directly by the users. Details can be found [here](file_systems.md#backup-and-snapshots-of-the-file-system). !!! warning - If you accidentally delete your data in the "no backup" file systems it **can not be restored**! + If you accidentally delete your data in the "no backup" filesystems it **can not be restored**! ### Folder Structure and Organizing Data -Organizing of living data using the file system helps for consistency and structuredness of the +Organizing of living data using the filesystem helps for consistency of the project. We recommend following the rules for your work regarding: * Organizing the data: Never change the original data; Automatize the organizing the data; Clearly @@ -81,7 +81,7 @@ project. We recommend following the rules for your work regarding: don’t replace documentation and metadata; Use standards of your discipline; Make rules for your project, document and keep them (See the [README recommendations]**todo link** below) -This is the example of an organisation (hierarchical) for the folder structure. Use it as a visual +This is the example of an organization (hierarchical) for the folder structure. Use it as a visual illustration of the above:  @@ -130,7 +130,7 @@ you don’t need throughout its life cycle. 
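Complementing the illustration above, a plain-text sketch of such a project hierarchy could look as follows; all directory names are purely illustrative:

```console
my_project/
    README.md          # short description, contacts, project-specific rules
    data_raw/          # original data, never modified
    data_processed/    # derived data, can be regenerated
    src/               # scripts and code
    results/           # final results
    doc/               # documentation and metadata
```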
<!--## Software Packages--> -<!--As was written before the module concept is the basic concept for using software on Taurus.--> +<!--As was written before the module concept is the basic concept for using software on ZIH system.--> <!--Uniformity of the project has to be achieved by using the same set of software on different levels.--> <!--It could be done by using environments. There are two types of environments should be distinguished:--> <!--runtime environment (the project level, use scripts to load [modules]**todo link**), Python virtual--> @@ -144,16 +144,16 @@ you don’t need throughout its life cycle. <!--### Python Virtual Environment--> -<!--If you are working with the Python then it is crucial to use the virtual environment on Taurus. The--> +<!--If you are working with the Python then it is crucial to use the virtual environment on ZIH system. The--> <!--main purpose of Python virtual environments (don't mess with the software environment for modules)--> <!--is to create an isolated environment for Python projects (self-contained directory tree that--> <!--contains a Python installation for a particular version of Python, plus a number of additional--> <!--packages).--> <!--**Vitualenv (venv)** is a standard Python tool to create isolated Python environments. We--> -<!--recommend using venv to work with Tensorflow and Pytorch on Taurus. It has been integrated into the--> +<!--recommend using venv to work with Tensorflow and Pytorch on ZIH system. It has been integrated into the--> <!--standard library under the [venv module]**todo link**. **Conda** is the second way to use a virtual--> -<!--environment on the Taurus. Conda is an open-source package management system and environment--> +<!--environment on the ZIH system. Conda is an open-source package management system and environment--> <!--management system from the Anaconda.--> <!--[Detailed information]**todo link** about using the virtual environment.--> @@ -168,9 +168,9 @@ you don’t need throughout its life cycle. The concept of **permissions** and **ownership** is crucial in Linux. See the [HPC-introduction]**todo link** slides for the understanding of the main concept. Standard Linux -changing permission command (i.e `chmod`) valid for Taurus as well. The **group** access level +changing permission command (i.e `chmod`) valid for ZIH system as well. The **group** access level contains members of your project group. Be careful with 'write' permission and never allow to change the original data. -Useful links: [Data Management]**todo link**, [File Systems]**todo link**, [Get Started with -HPC-DA]**todo link**, [Project Management]**todo link**, [Preservation research data[**todo link** +Useful links: [Data Management]**todo link**, [Filesystems]**todo link**, +[Project Management]**todo link**, [Preservation research data[**todo link** diff --git a/doc.zih.tu-dresden.de/docs/data_protection_declaration.md b/doc.zih.tu-dresden.de/docs/data_protection_declaration.md new file mode 100644 index 0000000000000000000000000000000000000000..a9b56833750cab80fdec3d9c7f838dfdcbeec840 --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/data_protection_declaration.md @@ -0,0 +1,15 @@ +# Datenschutzerklärung + +Zur Bereitstellung des Dienstes werden folgende personenbeziehbaren Daten verarbeitet: IP Addresse. + +Eine Nutzung dieser Daten für andere Zwecke erfolgt nicht. Eine Speicherung dieser Daten erfolgt nur +zur Fehleranalyse. Eine Ãœbermittlung dieser Daten an Dritte erfolgt nur, wenn dies gesetzlich +bestimmt ist. 
+ +Jeder Nutzer kann sich jederzeit an den [Datenschutzbeauftragten der TU +Dresden](https://tu-dresden.de/tu-dresden/organisation/gremien-und-beauftragte/beauftragte/datenschutzbeauftragter) +sowie an die [zuständige Aufsichtsbehörde für den Datenschutz](https://www.saechsdsb.de/) wenden. + +Weiterhin besteht die Möglichkeit jederzeit Auskunft über die zu seiner Person verarbeiteten Daten +zu verlangen und es steht eine Antwort mit der Frist von einem Monat nach Eingang des +Auskunftsersuchens zu. diff --git a/doc.zih.tu-dresden.de/docs/index.md b/doc.zih.tu-dresden.de/docs/index.md index caa05b3d9092529f86514885756dd2a5f73f7827..cc174e052a72bf6258ce4844749690ae28d7a46c 100644 --- a/doc.zih.tu-dresden.de/docs/index.md +++ b/doc.zih.tu-dresden.de/docs/index.md @@ -5,8 +5,7 @@ Dear HPC users, due to restrictions coming from data security and software incompatibilities the old "HPC Compendium" is now reachable only from inside TU Dresden campus (or via VPN). -Internal users should be redirected automatically to the -[internal IP address](http://141.76.17.11/hpc-wiki/bin/view/Compendium). +Internal users should be redirected automatically. We apologize for this severe action, but we are in the middle of the preparation for a wiki relaunch, so we do not want to redirect resources to fix technical/security issues for a system diff --git a/doc.zih.tu-dresden.de/docs/jobs_and_resources/hpcda.md b/doc.zih.tu-dresden.de/docs/jobs_and_resources/hpcda.md deleted file mode 100644 index d7bdec9afe83de27488e712b07e5fd5bdbcfcd17..0000000000000000000000000000000000000000 --- a/doc.zih.tu-dresden.de/docs/jobs_and_resources/hpcda.md +++ /dev/null @@ -1,67 +0,0 @@ -# HPC for Data Analytics - -With the HPC-DA system, the TU Dresden provides infrastructure for High-Performance Computing and -Data Analytics (HPC-DA) for German researchers for computing projects with focus in one of the -following areas: - -- machine learning scenarios for large systems -- evaluation of various hardware settings for large machine learning - problems, including accelerator and compute node configuration and - memory technologies -- processing of large amounts of data on highly parallel machine - learning infrastructure. - -Currently we offer 25 Mio core hours compute time per year for external computing projects. -Computing projects have a duration of up to one year with the possibility of extensions, thus -enabling projects to continue seamlessly. Applications for regular projects on HPC-DA can be -submitted at any time via the -[online web-based submission](https://tu-dresden.de/zih/hochleistungsrechnen/zugang/hpc-da) -and review system. The reviews of the applications are carried out by experts in their respective -scientific fields. Applications are evaluated only according to their scientific excellence. - -ZIH provides a portfolio of preinstalled applications and offers support for software -installation/configuration of project-specific applications. In particular, we provide consulting -services for all our users, and advise researchers on using the resources in an efficient way. 
- -\<img align="right" alt="HPC-DA Overview" -src="%ATTACHURL%/bandwidth.png" title="bandwidth.png" width="250" /> - -## Access - -- Application for access using this - [Online Web Form](https://tu-dresden.de/zih/hochleistungsrechnen/zugang/hpc-da) - -## Hardware Overview - -- [Nodes for machine learning (Power9)](../jobs_and_resources/power9.md) -- [NVMe Storage](../jobs_and_resources/nvme_storage.md) (2 PB) -- [Warm archive](../data_lifecycle/file_systems.md#warm-archive) (10 PB) -- HPC nodes (x86) for DA (island 6) -- Compute nodes with high memory bandwidth: - [AMD Rome Nodes](../jobs_and_resources/rome_nodes.md) (island 7) - -Additional hardware: - -- [Multi-GPU-Cluster](../jobs_and_resources/alpha_centauri.md) for projects of SCADS.AI - -## File Systems and Object Storage - -- Lustre -- BeeGFS -- Quobyte -- S3 - -## HOWTOS - -- [Get started with HPC-DA](../software/get_started_with_hpcda.md) -- [IBM Power AI](../software/power_ai.md) -- [Work with Singularity Containers on Power9]**todo** Cloud -- [TensorFlow on HPC-DA (native)](../software/tensorflow.md) -- [Tensorflow on Jupyter notebook](../software/tensorflow_on_jupyter_notebook.md) -- Create and run your own TensorFlow container for HPC-DA (Power9) (todo: no link at all in old compendium) -- [TensorFlow on x86](../software/deep_learning.md) -- [PyTorch on HPC-DA (Power9)](../software/pytorch.md) -- [Python on HPC-DA (Power9)](../software/python.md) -- [JupyterHub](../access/jupyterhub.md) -- [R on HPC-DA (Power9)](../software/data_analytics_with_r.md) -- [Big Data frameworks: Apache Spark, Apache Flink, Apache Hadoop](../software/big_data_frameworks.md) diff --git a/doc.zih.tu-dresden.de/docs/legal_notice.md b/doc.zih.tu-dresden.de/docs/legal_notice.md new file mode 100644 index 0000000000000000000000000000000000000000..3412a3a0a511d26d1a8bf8e730161622fb7930d9 --- /dev/null +++ b/doc.zih.tu-dresden.de/docs/legal_notice.md @@ -0,0 +1,24 @@ +# Legal Notice / Impressum + +Es gilt das [Impressum der TU Dresden](https://tu-dresden.de/impressum) mit folgenden Änderungen: + +## Ansprechpartner/Betreiber: + +Technische Universität Dresden +Zentrum für Informationsdienste und Hochleistungsrechnen +01062 Dresden + +Tel.: +49 351 463-40000 +Fax: +49 351 463-42328 +E-Mail: servicedesk@tu-dresden.de + +## Konzeption, Technische Umsetzung, Anbieter: + +Technische Universität Dresden +Zentrum für Informationsdienste und Hochleistungsrechnen +Prof. Dr. Wolfgang E. Nagel +01062 Dresden + +Tel.: +49 351 463-35450 +Fax: +49 351 463-37773 +E-Mail: zih@tu-dresden.de diff --git a/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md b/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md index 3fda99a5acfc67b6117dd4caac2943cd35ede33c..5c3eee415359c62432a409dc1f0f55818c8986bb 100644 --- a/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md +++ b/doc.zih.tu-dresden.de/docs/software/big_data_frameworks_spark.md @@ -14,7 +14,7 @@ marie@login$ module av Spark ``` The **aim** of this page is to introduce users on how to start working with -these frameworks on ZIH systems, e. g. on the [HPC-DA](../jobs_and_resources/hpcda.md) system. +these frameworks on ZIH systems. 
**Prerequisites:** To work with the frameworks, you need [access](../access/ssh_login.md) to ZIH systems and basic knowledge about data analysis and the batch system @@ -127,7 +127,7 @@ in an interactive job with: marie@compute$ source framework-configure.sh spark my-config-template ``` -### Using Hadoop Distributed File System (HDFS) +### Using Hadoop Distributed Filesystem (HDFS) If you want to use Spark and HDFS together (or in general more than one framework), a scheme similar to the following can be used: @@ -156,10 +156,7 @@ Please use a [batch job](../jobs_and_resources/slurm.md) similar to There are two general options on how to work with Jupyter notebooks: There is [JupyterHub](../access/jupyterhub.md), where you can simply -run your Jupyter notebook on HPC nodes (the preferable way). Also, you -can run a remote Jupyter server manually within a GPU job using -the modules and packages you need. You can find the manual server -setup [here](deep_learning.md). +run your Jupyter notebook on HPC nodes (the preferable way). ### Preparation @@ -202,14 +199,14 @@ You are now ready to spawn a notebook with Spark. Assuming that you have prepared everything as described above, you can go to [https://taurus.hrsk.tu-dresden.de/jupyter](https://taurus.hrsk.tu-dresden.de/jupyter). In the tab "Advanced", go -to the field "Preload modules" and select one of the Spark modules. +to the field `Preload modules` and select one of the Spark modules. When your Jupyter instance is started, check whether the kernel that you created in the preparation phase (see above) is shown in the top right corner of the notebook. If it is not already selected, select the kernel `haswell-py3.6-spark`. Then, you can set up Spark. Since the setup in the notebook requires more steps than in an interactive session, we have created an example notebook that you can use as a starting point -for convenience: [SparkExample.ipynb](misc/SparkExample.ipynb) +for convenience: [Spark-Example](misc/SparkExample.ipynb) !!! note diff --git a/doc.zih.tu-dresden.de/docs/software/cfd.md b/doc.zih.tu-dresden.de/docs/software/cfd.md index 3d14fb1f5c9680d2169c9a095eb1bff9f5e2ed37..492cb96d24f3761e2820fdba34eaa6b0a35db320 100644 --- a/doc.zih.tu-dresden.de/docs/software/cfd.md +++ b/doc.zih.tu-dresden.de/docs/software/cfd.md @@ -1,49 +1,49 @@ # Computational Fluid Dynamics (CFD) -| | | | | -|---------------|------------|-----------|------------| -| | **Taurus** | **Venus** | **Module** | -| **OpenFOAM** | x | | openfoam | -| **CFX** | x | x | ansys | -| **Fluent** | x | x | ansys | -| **ICEM CFD** | x | x | ansys | -| **STAR-CCM+** | x | | star | +The following CFD applications are available on our system: + +| | **Module** | +|---------------|------------| +| **OpenFOAM** | openfoam | +| **CFX** | ansys | +| **Fluent** | ansys | +| **ICEM CFD** | ansys | +| **STAR-CCM+** | star | ## OpenFOAM The OpenFOAM (Open Field Operation and Manipulation) CFD Toolbox can simulate anything from complex fluid flows involving chemical reactions, turbulence and heat transfer, to solid dynamics, -electromagnetics and the pricing of financial options. OpenFOAM is develop primarly by -[OpenCFD Ltd](https://www.openfoam.com) and is freely available and open source, +electromagnetics and the pricing of financial options. OpenFOAM is developed primarily by +[OpenCFD Ltd](https://www.openfoam.com) and is freely available and open-source, licensed under the GNU General Public Licence. 
-The command "module spider OpenFOAM" provides the list of installed OpenFOAM versions. In order to -use OpenFOAM, it is mandatory to set the environment by sourcing the \`bashrc\` (for users running -bash or ksh) or \`cshrc\` `(for users running tcsh` or `csh)` provided by OpenFOAM: +The command `module spider OpenFOAM` provides the list of installed OpenFOAM versions. In order to +use OpenFOAM, it is mandatory to set the environment by sourcing the `bashrc` (for users running +bash or ksh) or `cshrc` (for users running tcsh or csh) provided by OpenFOAM: -```Shell Session -module load OpenFOAM/VERSION -source $FOAM_BASH -# source $FOAM_CSH +```console +marie@login$ module load OpenFOAM/VERSION +marie@login$ source $FOAM_BASH +marie@login$ # source $FOAM_CSH ``` -Example for OpenFOAM job script: - -```Bash -#!/bin/bash -#SBATCH --time=12:00:00 # walltime -#SBATCH --ntasks=60 # number of processor cores (i.e. tasks) -#SBATCH --mem-per-cpu=500M # memory per CPU core -#SBATCH -J "Test" # job name -#SBATCH --mail-user=mustermann@tu-dresden.de # email address (only tu-dresden) -#SBATCH --mail-type=ALL - -OUTFILE="Output" -module load OpenFOAM -source $FOAM_BASH -cd /scratch/<YOURUSERNAME> # work directory in /scratch...! -srun pimpleFoam -parallel > "$OUTFILE" -``` +???+ example "Example for OpenFOAM job script:" + ```bash + #!/bin/bash + #SBATCH --time=12:00:00 # walltime + #SBATCH --ntasks=60 # number of processor cores (i.e. tasks) + #SBATCH --mem-per-cpu=500M # memory per CPU core + #SBATCH -job-name="Test" # job name + #SBATCH --mail-user=marie@tu-dresden.de # email address (only tu-dresden) + #SBATCH --mail-type=ALL + + OUTFILE="Output" + module load OpenFOAM + source $FOAM_BASH + cd /scratch/ws/1/marie-example-workspace # work directory using workspace + srun pimpleFoam -parallel > "$OUTFILE" + ``` ## Ansys CFX @@ -51,67 +51,67 @@ Ansys CFX is a powerful finite-volume-based program package for modeling general complex geometries. The main components of the CFX package are the flow solver cfx5solve, the geometry and mesh generator cfx5pre, and the post-processor cfx5post. -Example for CFX job script: - -```Bash -#!/bin/bash -#SBATCH --time=12:00 # walltime -#SBATCH --ntasks=4 # number of processor cores (i.e. tasks) -#SBATCH --mem-per-cpu=1900M # memory per CPU core -#SBATCH --mail-user=.......@tu-dresden.de # email address (only tu-dresden) -#SBATCH --mail-type=ALL +???+ example "Example for CFX job script:" + ```bash + #!/bin/bash + #SBATCH --time=12:00 # walltime + #SBATCH --ntasks=4 # number of processor cores (i.e. tasks) + #SBATCH --mem-per-cpu=1900M # memory per CPU core + #SBATCH --mail-user=marie@tu-dresden.de # email address (only tu-dresden) + #SBATCH --mail-type=ALL -module load ANSYS -cd /scratch/<YOURUSERNAME> # work directory in /scratch...! -cfx-parallel.sh -double -def StaticMixer.def -``` + module load ANSYS + cd /scratch/ws/1/marie-example-workspace # work directory using workspace + cfx-parallel.sh -double -def StaticMixer.def + ``` ## Ansys Fluent -Fluent need the hostnames and can be run in parallel like this: - -```Bash -#!/bin/bash -#SBATCH --time=12:00 # walltime -#SBATCH --ntasks=4 # number of processor cores (i.e. 
tasks) -#SBATCH --mem-per-cpu=1900M # memory per CPU core -#SBATCH --mail-user=.......@tu-dresden.de # email address (only tu-dresden) -#SBATCH --mail-type=ALL -module load ANSYS +???+ example "Fluent needs the host names and can be run in parallel like this:" + ```bash + #!/bin/bash + #SBATCH --time=12:00 # walltime + #SBATCH --ntasks=4 # number of processor cores (i.e. tasks) + #SBATCH --mem-per-cpu=1900M # memory per CPU core + #SBATCH --mail-user=marie@tu-dresden.de # email address (only tu-dresden) + #SBATCH --mail-type=ALL + module load ANSYS -nodeset -e $SLURM_JOB_NODELIST | xargs -n1 > hostsfile_job_$SLURM_JOBID.txt + nodeset -e $SLURM_JOB_NODELIST | xargs -n1 > hostsfile_job_$SLURM_JOBID.txt -fluent 2ddp -t$SLURM_NTASKS -g -mpi=intel -pinfiniband -cnf=hostsfile_job_$SLURM_JOBID.txt < input.in -``` + fluent 2ddp -t$SLURM_NTASKS -g -mpi=intel -pinfiniband -cnf=hostsfile_job_$SLURM_JOBID.txt < input.in + ``` -To use fluent interactive please try +To use fluent interactively, please try: -```Shell Session -module load ANSYS/19.2 -srun -N 1 --cpus-per-task=4 --time=1:00:00 --pty --x11=first bash -fluent & +```console +marie@login$ module load ANSYS/19.2 +marie@login$ srun --nodes=1 --cpus-per-task=4 --time=1:00:00 --pty --x11=first bash +marie@login$ fluent & ``` ## STAR-CCM+ -Note: you have to use your own license in order to run STAR-CCM+ on Taurus, so you have to specify -the parameters -licpath and -podkey, see the example below. - -Our installation provides a script `create_rankfile -f CCM` that generates a hostlist from the SLURM -job environment that can be passed to starccm+ enabling it to run across multiple nodes. - -```Bash -#!/bin/bash -#SBATCH --time=12:00 # walltime -#SBATCH --ntasks=32 # number of processor cores (i.e. tasks) -#SBATCH --mem-per-cpu=2500M # memory per CPU core -#SBATCH --mail-user=.......@tu-dresden.de # email address (only tu-dresden) -#SBATCH --mail-type=ALL - -module load STAR-CCM+ - -LICPATH="port@host" -PODKEY="your podkey" -INPUT_FILE="your_simulation.sim" -starccm+ -collab -rsh ssh -cpubind off -np $SLURM_NTASKS -on $(/sw/taurus/tools/slurmtools/default/bin/create_rankfile -f CCM) -batch -power -licpath $LICPATH -podkey $PODKEY $INPUT_FILE -``` +!!! note + You have to use your own license in order to run STAR-CCM+ on ZIH systems, so you have to specify + the parameters `-licpath` and `-podkey`, see the example below. + +Our installation provides a script `create_rankfile -f CCM` that generates a host list from the +Slurm job environment that can be passed to `starccm+`, enabling it to run across multiple nodes. + +???+ example + ```bash + #!/bin/bash + #SBATCH --time=12:00 # walltime + #SBATCH --ntasks=32 # number of processor cores (i.e. 
tasks) + #SBATCH --mem-per-cpu=2500M # memory per CPU core + #SBATCH --mail-user=marie@tu-dresden.de # email address (only tu-dresden) + #SBATCH --mail-type=ALL + + module load STAR-CCM+ + + LICPATH="port@host" + PODKEY="your podkey" + INPUT_FILE="your_simulation.sim" + starccm+ -collab -rsh ssh -cpubind off -np $SLURM_NTASKS -on $(/sw/taurus/tools/slurmtools/default/bin/create_rankfile -f CCM) -batch -power -licpath $LICPATH -podkey $PODKEY $INPUT_FILE + ``` diff --git a/doc.zih.tu-dresden.de/docs/software/compilers.md b/doc.zih.tu-dresden.de/docs/software/compilers.md index 19a70e4638aa126176c8d705d472176e4bbbb915..4292602e02e77bf01ad04c8c01643aadcc8c580a 100644 --- a/doc.zih.tu-dresden.de/docs/software/compilers.md +++ b/doc.zih.tu-dresden.de/docs/software/compilers.md @@ -1,20 +1,20 @@ # Compilers -The following compilers are available on our platforms: +The following compilers are available on the ZIH system: -| | | | | +| | GNU Compiler Collection | Intel Compiler | PGI Compiler (Nvidia HPC SDK) | |----------------------|-----------|------------|-------------| -| | **Intel** | **GNU** | **PGI** | -| **C Compiler** | `icc` | `gcc` | `pgcc` | -| **C++ Compiler** | `icpc` | `g++` | `pgc++` | -| **Fortran Compiler** | `ifort` | `gfortran` | `pgfortran` | +| Further information | [GCC website](https://gcc.gnu.org/) | [C/C++](https://software.intel.com/en-us/c-compilers), [Fortran](https://software.intel.com/en-us/fortran-compilers) | [PGI website](https://www.pgroup.com) | +| Module name | GNU | intel | PGI | +| C Compiler | `gcc` | `icc` | `pgcc` | +| C++ Compiler | `g++` | `icpc` | `pgc++` | +| Fortran Compiler | `gfortran` | `ifort` | `pgfortran` | -For an overview of the installed compiler versions, please see our automatically updated -[SoftwareModulesList]**todo**SoftwareModulesList. +For an overview of the installed compiler versions, please use `module spider <module name>` +on the ZIH systems. -All C compiler support ANSI C and C99 with a couple of different language options. The support for -Fortran77, Fortran90, Fortran95, and Fortran2003 differs from one compiler to the other. Please -check the man pages to verify that your code can be compiled. +All compilers support various language standards, at least up to ISO C11, ISO C++ 2014, and Fortran 2003. +Please check the man pages to verify that your code can be compiled. Please note that the linking of C++ files normally requires the C++ version of the compiler to link the correct libraries. @@ -24,89 +24,59 @@ the correct libraries. Common options are: - `-g` to include information required for debugging -- `-pg` to generate gprof -style sample-based profiling information during the run +- `-pg` to generate gprof-like sample-based profiling information during the run - `-O0`, `-O1`, `-O2`, `-O3` to customize the optimization level from no (`-O0`) to aggressive (`-O3`) optimization - `-I` to set search path for header files - `-L` to set search path for libraries -Please note that aggressive optimization allows deviation from the strict IEEE arithmetic. Since the -performance impact of options like `-mp` is very hard the user herself has to balance speed and -desired accuracy of her application. There are several options for profiling, profile-guided -optimization, data alignment and so on. You can list all available compiler options with the option -`-help`. Reading the man-pages is a good idea, too. - -The user benefits from the (nearly) same set of compiler flags for optimization for the C,C++, and -Fortran-compilers. 
In the following table, only a couple of important compiler-dependent options are -listed. For more detailed information, the user should refer to the man pages or use the option --help to list all options of the compiler. - -\| **GCC** \| **Open64** \| **Intel** \| **PGI** \| **Pathscale** \| -Description\* \| - -| | | | | | | -|----------------------|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|-----------------|-------------------------------------------------------------------------------------| -| `-fopenmp` | `-mp` | `-openmp` | `-mp` | `-mp` | turn on OpenMP support | -| `-ieee-fp` | `-fno-fast-math` | `-mp` | `-Kieee` | `-no-fast-math` | use this flag to limit floating-point optimizations and maintain declared precision | -| `-ffast-math` | `-ffast-math` | `-mp1` | `-Knoieee` | `-ffast-math` | some floating-point optimizations are allowed, less performance impact than `-mp` . | -| `-Ofast` | `-Ofast` | `-fast` | `-fast` | `-Ofast` | Maximize performance, implies a couple of other flags | -| | | `-fpe`<span class="twiki-macro FOOTNOTE">ifort only</span> `-ftz`<span class="twiki-macro FOOTNOTE">flushes denormalized numbers to zero: On Itanium 2 an underflow raises an underflow exception that needs to be handled in software. This takes about 1000 cycles!</span> | `-Ktrap`... | | Controls the behavior of the processor when floating-point exceptions occur. | -| `-mavx` `-msse4.2` | `-mavx` `-msse4.2` | `-msse4.2` | `-fastsse` | `-mavx` | "generally optimal flags" for supporting SSE instructions | -| | `-ipa` | `-ipo` | `-Mipa` | `-ipa` | inter procedure optimization (across files) | -| | | `-ip` | `-Mipa` | | inter procedure optimization (within files) | -| | `-apo` | `-parallel` | `-Mconcur` | `-apo` | Auto-parallelizer | -| `-fprofile-generate` | | `-prof-gen` | `-Mpfi` | `-fb-create` | Create instrumented code to generate profile in file \<FN> | -| `-fprofile-use` | | `-prof-use` | `-Mpfo` | `-fb-opt` | Use profile data for optimization. - Leave all other optimization options | - -*We can not generally give advice as to which option should be used - even -O0 sometimes leads to a -fast code. To gain maximum performance please test the compilers and a few combinations of -optimization flags. In case of doubt, you can also contact ZIH and ask the staff for help.* - -### Vector Extensions - -To build an executable for different node types (e.g. Sandybridge and -Westmere) the option `-msse4.2 -axavx` (for Intel compilers) uses SSE4.2 -as default path and runs along a different execution path if AVX is -available. This increases the size of the program code (might result in +Please note that aggressive optimization allows deviation from the strict IEEE arithmetic. +Since the performance impact of options like `-fp-model strict` is very hard you +have to balance speed and desired accuracy of your application yourself. + +The user benefits from the (nearly) same set of compiler flags for optimization for the C, C++, and +Fortran-compilers. +In the following table, only a couple of important compiler-dependent options are listed. +For more detailed information about these and further flags, the user should refer to the man +pages or use the option `--help` to list all options of the compiler. 
+ +| GCC | Intel | PGI | Description | +|----------------------|--------------|-------------|-------------------------------------------------------------------------------------| +| `-fopenmp` | `-fopenmp` | `-mp` | turn on OpenMP support | +| `-std=c99`, `-std=c++11`, `-std=f2018` | `-std=c99`, `-std=c++11`, `-std18` | `-c99`, `--c++11`, n/a | set language standard, for example C99, C++11, Fortran 2018 | +| `-mieee-fp` `-frounding-math` | `-fp-model precise` or `-fp-model strict` | `-Kieee` | limit floating-point optimizations and maintain declared precision | +| `-ffast-math` | `-mp1` or `-fp-model fast` | `-Mfprelaxed` | allow floating-point optimizations, may violate IEEE conformance | +| `-Ofast` | `-fast` | `-fast` | Maximize performance, implies a couple of other flags | +| `-fsignaling-nans` `-fno-trapping-math` | C/C++: `-fpe-trap`, Fortran: `-fpe-all` | `-Ktrap` | controls the behavior when floating-point exceptions occur | +| `-mavx` `-msse4.2` | `-mavx` `-msse4.2` | `-fastsse` | "generally optimal flags" for supporting SSE instructions | +| `-flto` | `-ipo` | `-Mipa` | interprocedural / link-time optimization (across source files) | +| `-floop-parallelize-all -ftree-parallelize-loops=<numthreads>` | `-parallel` | `-Mconcur` | auto-parallelizer | +| `-fprofile-generate` | `-prof-gen` | `-Mpfi` | create instrumented code to generate profile in file | +| `-fprofile-use` | `-prof-use` | `-Mpfo` | use profile data for optimization | + +!!! note + We can not generally give advice as to which option should be used. + To gain maximum performance please test the compilers and a few combinations of + optimization flags. + In case of doubt, you can also contact [HPC support](../support.md) and ask the staff for help. + +### Architecture-specific Optimizations + +Different architectures of CPUs feature different vector extensions (like SSE4.2 and AVX) +to accelerate computations. +The following matrix shows proper compiler flags for the architectures at the ZIH: + +| Architecture | GCC | Intel | PGI | +|--------------------|----------------------|----------------------|-----| +| Intel Haswell | `-march=haswell` | `-march=haswell` | `-tp=haswell` | +| AMD Rome | `-march=znver2` | `-march=core-avx2` | `-tp=zen` | +| Intel Cascade Lake | `-march=cascadelake` | `-march=cascadelake` | `-tp=skylake` | +| Host's architecture | `-march=native` | `-xHost` | | + +To build an executable for different node types (e.g. Cascade Lake with AVX512 and +Haswell without AVX512) the option `-march=haswell -axcascadelake` (for Intel compilers) +uses vector extension up to AVX2 as default path and runs along a different execution +path if AVX512 is available. +This increases the size of the program code (might result in poorer L1 instruction cache hits) but enables to run the same program on different hardware types. - -To optimize for the host architecture, the flags: - -| GCC | Intel | -|:--------------|:-------| -| -march=native | -xHost | - -can be used. 
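As a minimal sketch of how the flags above are passed in practice (the source file `myprog.c` is a placeholder, and the respective compiler module has to be loaded first), a plain C program can either be tuned for the node the compiler runs on or be built as a multi-path binary for Haswell and Cascade Lake:

```console
marie@compute$ gcc -O3 -march=native -o myprog myprog.c                   # GCC: optimize for the architecture of the current node
marie@compute$ icc -O3 -march=haswell -axcascadelake -o myprog myprog.c   # Intel: AVX2 default path plus an AVX512 path for Cascade Lake
```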
- -The following matrix shows some proper optimization flags for the -different hardware in Taurus, as of 2020-04-08: - -| Arch | GCC | Intel Compiler | -|:-----------------------|:-------------------|:-----------------| -| **Intel Sandy Bridge** | -march=sandybridge | -xAVX | -| **Intel Haswell** | -march=haswell | -xCORE-AVX2 | -| **AMD Rome** | -march=znver2 | -march=core-avx2 | -| **Intel Cascade Lake** | -march=cascadelake | -xCOMMON-AVX512 | - -## Compiler Optimization Hints - -To achieve the best performance the compiler needs to exploit the -parallelism in the code. Therefore it is sometimes necessary to provide -the compiler with some hints. Some possible directives are (Fortran -style): - -| | | -|--------------------------|------------------------------------| -| `CDEC$ ivdep` | ignore assumed vector dependencies | -| `CDEC$ swp` | try to software-pipeline | -| `CDEC$ noswp` | disable software-pipeline | -| `CDEC$ loop count (n)` | hint for optimization | -| `CDEC$ distribute point` | split this large loop | -| `CDEC$ unroll (n)` | unroll (n) times | -| `CDEC$ nounroll` | do not unroll | -| `CDEC$ prefetch a` | prefetch array a | -| `CDEC$ noprefetch a` | do not prefetch array a | - -The compiler directives are the same for `ifort` and `icc` . The syntax for C/C++ is like `#pragma -ivdep`, `#pragma swp`, and so on. diff --git a/doc.zih.tu-dresden.de/docs/software/data_analytics.md b/doc.zih.tu-dresden.de/docs/software/data_analytics.md index b6b88dddb45b70ed76cf0c6feaf8c8a5f9052820..31ce02047f2ad70209a8613bf179634dcc643893 100644 --- a/doc.zih.tu-dresden.de/docs/software/data_analytics.md +++ b/doc.zih.tu-dresden.de/docs/software/data_analytics.md @@ -9,7 +9,7 @@ The following tools are available in the ZIH system, among others: 1. [Python](data_analytics_with_python.md) 1. [R](data_analytics_with_r.md) -1. [Rstudio](data_analytics_with_rstudio.md) +1. [RStudio](data_analytics_with_rstudio.md) 1. [Big Data framework Spark](big_data_frameworks_spark.md) 1. [MATLAB and Mathematica](mathematics.md) @@ -28,7 +28,7 @@ Likewise software can be used within [containers](containers.md). For the transfer of larger amounts of data into and within the system, the [export nodes and data mover](../data_transfer/overview.md) should be used. -The data storage takes place in the [work spaces](../data_lifecycle/workspaces.md). +The data storage takes place in the [workspaces](../data_lifecycle/workspaces.md). Software modules or virtual environments can also be installed in workspaces to enable collaborative work even within larger groups. General recommendations for setting up workflows can be found in the [experiments](../data_lifecycle/experiments.md) section. diff --git a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md index ea971155b89b8b7eb3abe7fde6a60dbe231ef9dc..222a8ce0b5b173fb29f57a062e62ee21bdeaed3e 100644 --- a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md +++ b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md @@ -5,212 +5,158 @@ allows you to work with python quicker and more effective. Here the general intr with python on ZIH system is given. For specific machine learning frameworks see respective documentation in [machine learning](machine_learning.md) section. 
-## Python Virtual Environments +## Python Console and Virtual Environments Often it is useful to create an isolated development environment, which can be shared among a research group and/or teaching class. For this purpose python virtual environments can be used. For more details see [here](python_virtual_environments.md). +The interactive Python interpreter can also be used on ZIH systems via an interactive job: + +```console +marie@login$ srun -p alpha --gres=gpu:1 -n 1 -c 7 --pty --mem-per-cpu=8000 bash #Job submission on alpha nodes with 1 gpu on 1 node with 8000 Mb per CPU +marie@alpha$ python +``` + ## Jupyter Notebooks Jupyter notebooks are a great way for interactive computing in a web browser. They allow working with data cleaning and transformation, -numerical simulation, statistical modelling, data visualization and machine learning. +numerical simulation, statistical modeling, data visualization and machine learning. On ZIH system a [JupyterHub](../access/jupyterhub.md) is available, which can be used to run -a Jupyter notebook on an HPC node, as well using a GPU when needed. +a Jupyter notebook on a node, as well using a GPU when needed. -## Dask +## Parallel Computing with Python -**Dask** is an open-source library for parallel computing. Dask is a flexible library for parallel -computing in Python. +### Dask -Dask natively scales Python. It provides advanced parallelism for analytics, enabling performance at -scale for some of the popular tools. For instance: Dask arrays scale Numpy workflows, Dask +[Dask](https://dask.org/) is a flexible and open-source library for parallel computing in Python. +It scales Python and provides advanced parallelism for analytics, enabling performance at +scale for some of the popular tools. For instance: Dask arrays scale NumPy workflows, Dask dataframes scale Pandas workflows, Dask-ML scales machine learning APIs like Scikit-Learn and XGBoost. Dask is composed of two parts: -- Dynamic task scheduling optimized for computation and interactive - computational workloads. -- Big Data collections like parallel arrays, data frames, and lists - that extend common interfaces like NumPy, Pandas, or Python - iterators to larger-than-memory or distributed environments. These - parallel collections run on top of dynamic task schedulers. +- Dynamic task scheduling optimized for computation and interactive computational workloads. +- Big Data collections like parallel arrays, data frames, and lists that extend common interfaces + like NumPy, Pandas, or Python iterators to larger-than-memory or distributed environments. + These parallel collections run on top of dynamic task schedulers. Dask supports several user interfaces: -High-Level: - -- Arrays: Parallel NumPy -- Bags: Parallel lists -- DataFrames: Parallel Pandas -- Machine Learning : Parallel Scikit-Learn -- Others from external projects, like XArray - -Low-Level: - -- Delayed: Parallel function evaluation -- Futures: Real-time parallel function evaluation - -### Installation - -### Installation Using Conda - -Dask is installed by default in [Anaconda](https://www.anaconda.com/download/). To install/update -Dask on a Taurus with using the [conda](https://www.anaconda.com/download/) follow the example: - -```Bash -# Job submission in ml nodes with allocating: 1 node, 1 gpu per node, 4 hours -srun -p ml -N 1 -n 1 --mem-per-cpu=5772 --gres=gpu:1 --time=04:00:00 --pty bash -``` - -Create a conda virtual environment. We would recommend using a workspace. 
See the example (use -`--prefix` flag to specify the directory). - -**Note:** You could work with simple examples in your home directory (where you are loading by -default). However, in accordance with the -[HPC storage concept](../data_lifecycle/hpc_storage_concept2019.md) please use a -[workspaces](../data_lifecycle/workspaces.md) for your study and work projects. - -```Bash -conda create --prefix /scratch/ws/0/aabc1234-Workproject/conda-virtual-environment/dask-test python=3.6 +- High-Level + - Arrays: Parallel NumPy + - Bags: Parallel lists + - DataFrames: Parallel Pandas + - Machine Learning: Parallel Scikit-Learn + - Others from external projects, like XArray +- Low-Level + - Delayed: Parallel function evaluation + - Futures: Real-time parallel function evaluation + +#### Dask Installation + +!!! hint + This step might be obsolete, since the library may be already available as a module. + Check it with + ```console + marie@compute$ module spider dask + ``` + +The installation of Dask is very easy and can be done by a user using a [virtual environment](python_virtual_environments.md) + +```console +marie@compute$ module load SciPy-bundle/2020.11-fosscuda-2020b Pillow/8.0.1-GCCcore-10.2.0 +marie@compute$ virtualenv --system-site-packages dask-test +created virtual environment CPython3.8.6.final.0-64 in 10325ms + creator CPython3Posix(dest=~/dask-test, clear=False, global=True) + seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=~/.local/share/virtualenv) + added seed packages: pip==21.1.3, setuptools==57.4.0, wheel==0.36.2 + activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator +marie@compute$ source dask-test/bin/activate +(dask-test) marie@compute$ pip install dask dask-jobqueue +[...] +marie@compute$ python -c "import dask; print(dask.__version__)" +2021.08.1 ``` -By default, conda will locate the environment in your home directory: - -```Bash -conda create -n dask-test python=3.6 -``` - -Activate the virtual environment, install Dask and verify the installation: - -```Bash -ml modenv/ml -ml PythonAnaconda/3.6 -conda activate /scratch/ws/0/aabc1234-Workproject/conda-virtual-environment/dask-test python=3.6 -which python -which conda -conda install dask -python - -from dask.distributed import Client, progress -client = Client(n_workers=4, threads_per_worker=1) -client -``` - -### Installation Using Pip - -You can install everything required for most common uses of Dask (arrays, dataframes, etc) - -```Bash -srun -p ml -N 1 -n 1 --mem-per-cpu=5772 --gres=gpu:1 --time=04:00:00 --pty bash - -cd /scratch/ws/0/aabc1234-Workproject/python-virtual-environment/dask-test +The preferred and simplest way to run Dask on ZIH system is using +[dask-jobqueue](https://jobqueue.dask.org/). -ml modenv/ml -module load PythonAnaconda/3.6 -which python +**TODO** create better example with jobqueue -python3 -m venv --system-site-packages dask-test -source dask-test/bin/activate -python -m pip install "dask[complete]" - -python +```python from dask.distributed import Client, progress client = Client(n_workers=4, threads_per_worker=1) client ``` -Distributed scheduler - -? - -### Run Dask on Taurus - -The preferred and simplest way to run Dask on HPC systems today both for new, experienced users or -administrator is to use [dask-jobqueue](https://jobqueue.dask.org/). 
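Until a proper `dask-jobqueue` example is in place, a quick way to verify the installation above is to start a local `dask.distributed` client inside an interactive job. This is a minimal sketch assuming the `dask-test` virtual environment created above:

```console
marie@compute$ source dask-test/bin/activate
(dask-test) marie@compute$ python -c "from dask.distributed import Client; client = Client(n_workers=4, threads_per_worker=1); print(client)"
```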
- -You can install dask-jobqueue with `pip` or `conda` - -Installation with Pip - -```Bash -srun -p haswell -N 1 -n 1 -c 4 --mem-per-cpu=2583 --time=01:00:00 --pty bash -cd -/scratch/ws/0/aabc1234-Workproject/python-virtual-environment/dask-test -ml modenv/ml module load PythonAnaconda/3.6 which python - -source dask-test/bin/activate pip -install dask-jobqueue --upgrade # Install everything from last released version -``` - -Installation with Conda - -```Bash -srun -p haswell -N 1 -n 1 -c 4 --mem-per-cpu=2583 --time=01:00:00 --pty bash - -ml modenv/ml module load PythonAnaconda/3.6 source -dask-test/bin/activate - -conda install dask-jobqueue -c conda-forge\</verbatim> -``` - -## MPI for Python +### mpi4py - MPI for Python Message Passing Interface (MPI) is a standardized and portable message-passing standard designed to function on a wide variety of parallel computing architectures. The Message Passing Interface (MPI) is a library specification that allows HPC to pass information between its -various nodes and clusters. MPI designed to provide access to advanced +various nodes and clusters. MPI is designed to provide access to advanced parallel hardware for end-users, library writers and tool developers. -### Why use MPI? - -MPI provides a powerful, efficient and portable way to express parallel -programs. -Among many parallel computational models, message-passing has proven to be an effective one. - -### Parallel Python with mpi4py - -Mpi4py(MPI for Python) package provides bindings of the MPI standard for +mpi4py(MPI for Python) package provides bindings of the MPI standard for the python programming language, allowing any Python program to exploit multiple processors. -#### Why use mpi4py? - -Mpi4py based on MPI-2 C++ bindings. It supports almost all MPI calls. +mpi4py based on MPI-2 C++ bindings. It supports almost all MPI calls. This implementation is popular on Linux clusters and in the SciPy community. Operations are primarily methods of communicator objects. It -supports communication of pickleable Python objects. Mpi4py provides +supports communication of pickle-able Python objects. mpi4py provides optimized communication of NumPy arrays. -Mpi4py is included as an extension of the SciPy-bundle modules on -taurus. - -Please check the SoftwareModulesList for the modules availability. The availability of the mpi4py -in the module you can check by -the `module whatis <name_of_the module>` command. The `module whatis` -command displays a short information and included extensions of the -module. - -Moreover, it is possible to install mpi4py in your local conda -environment: - -```Bash -srun -p ml --time=04:00:00 -n 1 --pty --mem-per-cpu=8000 bash #allocate recources -module load modenv/ml -module load PythonAnaconda/3.6 #load module to use conda -conda create --prefix=<location_for_your_environment> python=3.6 anaconda #create conda virtual environment - -conda activate <location_for_your_environment> #activate your virtual environment +mpi4py is included as an extension of the SciPy-bundle modules on a ZIH system + +```console +marie@compute$ module load SciPy-bundle/2020.11-foss-2020b +Module SciPy-bundle/2020.11-foss-2020b and 28 dependencies loaded. +marie@compute$ pip list +Package Version +----------------------------- ---------- +[...] +mpi4py 3.0.3 +[...] 
+``` -conda install -c conda-forge mpi4py #install mpi4py +Other versions of the package can be found with + +```console +marie@compute$ module spider mpi4py +----------------------------------------------------------------------------------------------------------------------------------------- + mpi4py: +----------------------------------------------------------------------------------------------------------------------------------------- + Versions: + mpi4py/1.3.1 + mpi4py/2.0.0-impi + mpi4py/3.0.0 (E) + mpi4py/3.0.2 (E) + mpi4py/3.0.3 (E) + +Names marked by a trailing (E) are extensions provided by another module. + +----------------------------------------------------------------------------------------------------------------------------------------- + For detailed information about a specific "mpi4py" package (including how to load the modules) use the module's full name. + Note that names that have a trailing (E) are extensions provided by other modules. + For example: + + $ module spider mpi4py/3.0.3 +----------------------------------------------------------------------------------------------------------------------------------------- +``` -python #start python +Check if mpi4py is running correctly -from mpi4py import MPI #verify your mpi4py +```python +from mpi4py import MPI comm = MPI.COMM_WORLD print("%d of %d" % (comm.Get_rank(), comm.Get_size())) ``` + +**TODO** verify mpi4py installation + diff --git a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md index b307dad4e880bf0420bf413df3d106e6728e5fb4..a17f4974272edd8f521d91bc7dcc5b3612427de3 100644 --- a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md +++ b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_r.md @@ -1,53 +1,44 @@ # R for Data Analytics [R](https://www.r-project.org/about.html) is a programming language and environment for statistical -computing and graphics. R provides a wide variety of statistical (linear and nonlinear modelling, +computing and graphics. R provides a wide variety of statistical (linear and nonlinear modeling, classical statistical tests, time-series analysis, classification, etc) and graphical techniques. R is an integrated suite of software facilities for data manipulation, calculation and graphing. -R possesses an extensive catalogue of statistical and graphical methods. It includes machine +R possesses an extensive catalog of statistical and graphical methods. It includes machine learning algorithms, linear regression, time series, statistical inference. -We recommend using **Haswell** and/or **Romeo** partitions to work with R. For more details +We recommend using `haswell` and/or `rome` partitions to work with R. For more details see [here](../jobs_and_resources/hardware_taurus.md). ## R Console -This is a quickstart example. The `srun` command is used to submit a real-time execution job +In the following example the `srun` command is used to submit a real-time execution job designed for interactive use with monitoring the output. Please check [the Slurm page](../jobs_and_resources/slurm.md) for details. 
-```Bash -# job submission on haswell nodes with allocating: 1 task, 1 node, 4 CPUs per task with 2541 mb per CPU(core) for 1 hour -tauruslogin$ srun --partition=haswell --ntasks=1 --nodes=1 --cpus-per-task=4 --mem-per-cpu=2541 --time=01:00:00 --pty bash - -# Ensure that you are using the scs5 environment -module load modenv/scs5 -# Check all available modules for R with version 3.6 -module available R/3.6 -# Load default R module -module load R -# Checking the current R version -which R -# Start R console -R +```console +marie@login$ srun --partition=haswell --ntasks=1 --nodes=1 --cpus-per-task=4 --mem-per-cpu=2541 --time=01:00:00 --pty bash +marie@compute$ module load modenv/scs5 +marie@compute$ module available R/3.6 +marie@compute$ module load R +marie@compute$ which R +marie@compute$ R ``` Using `srun` is recommended only for short test runs, while for larger runs batch jobs should be -used. The examples can be found [here](get_started_with_hpcda.md) or -[here](../jobs_and_resources/slurm.md). +used. The examples can be found [here](../jobs_and_resources/slurm.md). It is also possible to run `Rscript` command directly (after loading the module): ```Bash -# Run Rscript directly. For instance: Rscript /scratch/ws/0/marie-study_project/my_r_script.R -Rscript /path/to/script/your_script.R param1 param2 +Rscript /path/to/script/your_script.R <param1> <param2> ``` ## R in JupyterHub -In addition to using interactive and batch jobs, it is possible to work with **R** using +In addition to using interactive and batch jobs, it is possible to work with R using [JupyterHub](../access/jupyterhub.md). The production and test [environments](../access/jupyterhub.md#standard-environments) of @@ -59,17 +50,15 @@ For using R with RStudio please refer to [Data Analytics with RStudio](data_anal ## Install Packages in R -By default, user-installed packages are saved in the users home in a subfolder depending on -the architecture (x86 or PowerPC). Therefore the packages should be installed using interactive +By default, user-installed packages are saved in the users home in a folder depending on +the architecture (`x86` or `PowerPC`). Therefore the packages should be installed using interactive jobs on the compute node: -```Bash -srun -p haswell --ntasks=1 --nodes=1 --cpus-per-task=4 --mem-per-cpu=2541 --time=01:00:00 --pty bash - -module purge -module load modenv/scs5 -module load R -R -e 'install.packages("package_name")' #For instance: 'install.packages("ggplot2")' +```console +marie@compute$ module load R +Module R/3.6.0-foss-2019a and 56 dependencies loaded. +marie@compute$ R -e 'install.packages("ggplot2")' +[...] ``` ## Deep Learning with R @@ -81,29 +70,21 @@ Therefore, using nodes with built-in GPUs ([ml](../jobs_and_resources/power9.md) ### R Interface to TensorFlow The ["TensorFlow" R package](https://tensorflow.rstudio.com/) provides R users access to the -Tensorflow toolset. [TensorFlow](https://www.tensorflow.org/) is an open-source software library +TensorFlow framework. [TensorFlow](https://www.tensorflow.org/) is an open-source software library for numerical computation using data flow graphs. 
-```Bash -srun --partition=ml --ntasks=1 --nodes=1 --cpus-per-task=7 --mem-per-cpu=5772 --gres=gpu:1 --time=04:00:00 --pty bash +The respective modules can be loaded with the following -module purge -ml modenv/ml -ml TensorFlow -ml R - -which python -mkdir python-virtual-environments # Create a folder for virtual environments -cd python-virtual-environments -python3 -m venv --system-site-packages R-TensorFlow #create python virtual environment -source R-TensorFlow/bin/activate #activate environment -module list -which R +```console +marie@compute$ module load R/3.6.2-fosscuda-2019b +Module R/3.6.2-fosscuda-2019b and 63 dependencies loaded. +marie@compute$ module load TensorFlow/2.3.1-fosscuda-2019b-Python-3.7.4 +Module TensorFlow/2.3.1-fosscuda-2019b-Python-3.7.4 and 15 dependencies loaded. ``` -Please allocate the job with respect to -[hardware specification](../jobs_and_resources/hardware_taurus.md)! Note that the nodes on `ml` -partition have 4way-SMT, so for every physical core allocated, you will always get 4\*1443Mb=5772mb. +!!! warning + Be aware that for compatibility reasons it is important to choose modules with + the same toolchain version (in this case `fosscuda/2019b`). For reference see [here](modules.md) In order to interact with Python-based frameworks (like TensorFlow) `reticulate` R library is used. To configure it to point to the correct Python executable in your virtual environment, create @@ -111,23 +92,39 @@ a file named `.Rprofile` in your project directory (e.g. R-TensorFlow) with the contents: ```R -Sys.setenv(RETICULATE_PYTHON = "/sw/installed/Anaconda3/2019.03/bin/python") #assign the output of the 'which python' from above to RETICULATE_PYTHON +Sys.setenv(RETICULATE_PYTHON = "/sw/installed/Python/3.7.4-GCCcore-8.3.0/bin/python") #assign the output of the 'which python' from above to RETICULATE_PYTHON ``` Let's start R, install some libraries and evaluate the result: -```R -install.packages("reticulate") -library(reticulate) -reticulate::py_config() -install.packages("tensorflow") -library(tensorflow) -tf$constant("Hello Tensorflow") #In the output 'Tesla V100-SXM2-32GB' should be mentioned +```rconsole +> install.packages(c("reticulate", "tensorflow")) +Installing packages into ‘~/R/x86_64-pc-linux-gnu-library/3.6’ +(as ‘lib’ is unspecified) +> reticulate::py_config() +python: /software/rome/Python/3.7.4-GCCcore-8.3.0/bin/python +libpython: /sw/installed/Python/3.7.4-GCCcore-8.3.0/lib/libpython3.7m.so +pythonhome: /software/rome/Python/3.7.4-GCCcore-8.3.0:/software/rome/Python/3.7.4-GCCcore-8.3.0 +version: 3.7.4 (default, Mar 25 2020, 13:46:43) [GCC 8.3.0] +numpy: /software/rome/SciPy-bundle/2019.10-fosscuda-2019b-Python-3.7.4/lib/python3.7/site-packages/numpy +numpy_version: 1.17.3 + +NOTE: Python version was forced by RETICULATE_PYTHON + +> library(tensorflow) +2021-08-26 16:11:47.110548: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 +> tf$constant("Hello TensorFlow") +2021-08-26 16:14:00.269248: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 +2021-08-26 16:14:00.674878: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: +pciBusID: 0000:0b:00.0 name: A100-SXM4-40GB computeCapability: 8.0 +coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s +[...] +tf.Tensor(b'Hello TensorFlow', shape=(), dtype=string) ``` ??? 
example The example shows the use of the TensorFlow package with the R for the classification problem - related to the MNIST dataset. + related to the MNIST data set. ```R library(tensorflow) library(keras) @@ -203,20 +200,15 @@ tf$constant("Hello Tensorflow") #In the output 'Tesla V100-SXM2-32GB' sh ## Parallel Computing with R Generally, the R code is serial. However, many computations in R can be made faster by the use of -parallel computations. Taurus allows a vast number of options for parallel computations. Large -amounts of data and/or use of complex models are indications to use parallelization. - -### General Information about the R Parallelism - -There are various techniques and packages in R that allow parallelization. This section -concentrates on most general methods and examples. The Information here is Taurus-specific. +parallel computations. This section concentrates on most general methods and examples. The [parallel](https://www.rdocumentation.org/packages/parallel/versions/3.6.2) library will be used below. -**Warning:** Please do not install or update R packages related to parallelism as it could lead to -conflicts with other pre-installed packages. +!!! warning + Please do not install or update R packages related to parallelism as it could lead to + conflicts with other preinstalled packages. -### Basic Lapply-Based Parallelism +### Basic lapply-Based Parallelism `lapply()` function is a part of base R. lapply is useful for performing operations on list-objects. Roughly speaking, lapply is a vectorization of the source code and it is the first step before @@ -294,9 +286,10 @@ running in parallel. The desired type of the cluster can be specified with a par This way of the R parallelism uses the [Rmpi](http://cran.r-project.org/web/packages/Rmpi/index.html) package and the [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) (Message Passing Interface) as a -"backend" for its parallel operations. The MPI-based job in R is very similar to submitting an +"back-end" for its parallel operations. The MPI-based job in R is very similar to submitting an [MPI Job](../jobs_and_resources/slurm.md#binding-and-distribution-of-tasks) since both are running -multicore jobs on multiple nodes. Below is an example of running R script with the Rmpi on Taurus: +multicore jobs on multiple nodes. Below is an example of running R script with the Rmpi on +ZIH system: ```Bash #!/bin/bash @@ -304,8 +297,8 @@ multicore jobs on multiple nodes. Below is an example of running R script with t #SBATCH --ntasks=32 # this parameter determines how many processes will be spawned, please use >=8 #SBATCH --cpus-per-task=1 #SBATCH --time=01:00:00 -#SBATCH -o test_Rmpi.out -#SBATCH -e test_Rmpi.err +#SBATCH --output=test_Rmpi.out +#SBATCH --error=test_Rmpi.err module purge module load modenv/scs5 @@ -322,10 +315,10 @@ However, in some specific cases, you can specify the number of nodes and the num tasks per node explicitly: ```Bash -#!/bin/bash #SBATCH --nodes=2 #SBATCH --tasks-per-node=16 #SBATCH --cpus-per-task=1 + module purge module load modenv/scs5 module load R @@ -394,7 +387,7 @@ Another example: #snow::stopCluster(cl) # usually it hangs over here with OpenMPI > 2.0. In this case this command may be avoided, Slurm will clean up after the job finishes ``` -To use Rmpi and MPI please use one of these partitions: **haswell**, **broadwell** or **rome**. +To use Rmpi and MPI please use one of these partitions: `haswell`, `broadwell` or `rome`. Use `mpirun` command to start the R script. 
It is a wrapper that enables the communication between processes running on different nodes. It is important to use `-np 1` (the number of spawned diff --git a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_rstudio.md b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_rstudio.md index 46f5ac3985d56d8d3c8d84d39376559ae3a4fe6b..7fd89780ade18eb15b7cc116ff89b1a778d876f2 100644 --- a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_rstudio.md +++ b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_rstudio.md @@ -11,8 +11,8 @@ similarly to a new kernel from [JupyterLab](../access/jupyterhub.md#jupyterlab) {: style="width:90%" } ???tip - If an error "could not start rstudio in time" occurs, try reloading the webpage with F5. + If an error "could not start RStudio in time" occurs, try reloading the web page with F5. ???note - Please note that it is currently not recommended to use an interactive x11 job with the - desktop version of RStudio, as described, for example, in introduction to HPC-DA slides. + Please note that it is currently not recommended to use an interactive `x11` job with the + desktop version of RStudio as described in the introductory slides. diff --git a/doc.zih.tu-dresden.de/docs/software/debuggers.md b/doc.zih.tu-dresden.de/docs/software/debuggers.md index fafb8c705f30a9e4b026d549b656aa7a0516540a..d88ca5f068f0145e8acc46407feca93a14968522 100644 --- a/doc.zih.tu-dresden.de/docs/software/debuggers.md +++ b/doc.zih.tu-dresden.de/docs/software/debuggers.md @@ -1,9 +1,16 @@ -# Debuggers +# Debugging -This section describes how to start the debuggers on the ZIH systems. +Debugging is an essential but also rather time consuming step during application development. Tools +dramatically reduce the amount of time spent to detect errors. Besides the "classical" serial +programming errors, which may usually be easily detected with a regular debugger, there exist +programming errors that result from the usage of OpenMP, Pthreads, or MPI. These errors may also be +detected with debuggers (preferably debuggers with support for parallel applications), however, +specialized tools like MPI checking tools (e.g. Marmot) or thread checking tools (e.g. Intel Thread +Checker) can simplify this task. -Detailed information about how to use the debuggers can be found on the -website of the debuggers (see below). +This page provides detailed information on classic debugging at ZIH systems. The more specific +topic [MPI Usage Error Detection](mpi_usage_error_detection.md) covers tools to detect MPI usage +errors. ## Overview of available Debuggers at ZIH @@ -17,30 +24,30 @@ website of the debuggers (see below). ## General Advices -- You need to compile your code with the flag `-g` to enable - debugging. This tells the compiler to include information about - variable and function names, source code lines etc. into the - executable. -- It is also recommendable to reduce or even disable optimizations - (`-O0` or gcc's `-Og`). At least inlining should be disabled (usually - `-fno-inline`). -- For parallel applications: try to reproduce the problem with less - processes or threads before using a parallel debugger. -- Use the compiler's check capabilites to find typical problems at - compile time or run time, read the manual (`man gcc`, `man ifort`, etc.) 
- - Intel C++ example: `icpc -g -std=c++14 -w3 -check=stack,uninit -check-pointers=rw -fp-trap=all` - - Intel Fortran example: `ifort -g -std03 -warn all -check all -fpe-all=0 -traceback` - - The flag `-traceback` of the Intel Fortran compiler causes to print - stack trace and source code location when the program terminates - abnormally. -- If your program crashes and you get an address of the failing - instruction, you can get the source code line with the command - `addr2line -e <executable> <address>` (if compiled with `-g`). -- Use [Memory Debuggers](#memory-debugging) to - verify the proper usage of memory. -- Core dumps are useful when your program crashes after a long - runtime. -- Slides from user training: [Introduction to Parallel Debugging](misc/debugging_intro.pdf) +- You need to compile your code with the flag `-g` to enable + debugging. This tells the compiler to include information about + variable and function names, source code lines etc. into the + executable. +- It is also recommendable to reduce or even disable optimizations + (`-O0` or gcc's `-Og`). At least inlining should be disabled (usually + `-fno-inline`). +- For parallel applications: try to reproduce the problem with less + processes or threads before using a parallel debugger. +- Use the compiler's check capabilities to find typical problems at + compile time or run time, read the manual (`man gcc`, `man ifort`, etc.) + - Intel C++ example: `icpc -g -std=c++14 -w3 -check=stack,uninit -check-pointers=rw -fp-trap=all` + - Intel Fortran example: `ifort -g -std03 -warn all -check all -fpe-all=0 -traceback` + - The flag `-traceback` of the Intel Fortran compiler causes to print + stack trace and source code location when the program terminates + abnormally. +- If your program crashes and you get an address of the failing + instruction, you can get the source code line with the command + `addr2line -e <executable> <address>` (if compiled with `-g`). +- Use [Memory Debuggers](#memory-debugging) to + verify the proper usage of memory. +- Core dumps are useful when your program crashes after a long + runtime. +- Slides from user training: [Introduction to Parallel Debugging](misc/debugging_intro.pdf) ## GNU Debugger (GDB) @@ -55,34 +62,28 @@ several ways: | Attach running program to GDB | `gdb --pid <process ID>` | | Open a core dump | `gdb <executable> <core file>` | -This [GDB Reference -Sheet](http://users.ece.utexas.edu/~adnan/gdb-refcard.pdf) makes life -easier when you often use GDB. +This [GDB Reference Sheet](http://users.ece.utexas.edu/~adnan/gdb-refcard.pdf) makes life easier +when you often use GDB. -Fortran 90 programmers may issue an -`module load ddt` before their debug session. This makes the GDB -modified by DDT available, which has better support for Fortran 90 (e.g. -derived types). +Fortran 90 programmers may issue an `module load ddt` before their debug session. This makes the GDB +modified by DDT available, which has better support for Fortran 90 (e.g. derived types). 
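+
+For a first orientation, a short GDB session on a compute node could look like the following
+sketch. The program name `myprog` and the breakpoint are placeholders, and the `srun` options
+should be adapted to your needs:
+
+```console
+marie@login$ srun --ntasks=1 --pty bash        # debug interactively on a compute node, not on the login node
+marie@compute$ gcc -g -O0 -fno-inline myprog.c -o myprog    # debug info on, optimizations off
+marie@compute$ gdb ./myprog
+(gdb) break main        # set a breakpoint at function main
+(gdb) run               # run until the breakpoint is hit
+(gdb) next              # execute the next source line
+(gdb) backtrace         # print the current call stack
+(gdb) quit
+```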
## Arm DDT  -- Intuitive graphical user interface and great support for parallel applications -- We have 1024 licences, so many user can use this tool for parallel - debugging -- Don't expect that debugging an MPI program with 100ths of process - will always work without problems - - The more processes and nodes involved, the higher is the - probability for timeouts or other problems - - Debug with as few processes as required to reproduce the bug you - want to find -- Module to load before using: `module load ddt` -- Start: `ddt <executable>` -- If the GUI runs too slow over your remote connection: - Use [WebVNC](../access/graphical_applications_with_webvnc.md) to start a remote desktop - session in a web browser. -- Slides from user training: [Parallel Debugging with DDT](misc/debugging_ddt.pdf) +- Intuitive graphical user interface and great support for parallel applications +- We have 1024 licences, so many user can use this tool for parallel debugging +- Don't expect that debugging an MPI program with 100ths of process will always work without + problems + - The more processes and nodes involved, the higher is the probability for timeouts or other + problems + - Debug with as few processes as required to reproduce the bug you want to find +- Module to load before using: `module load ddt` Start: `ddt <executable>` If the GUI runs too slow +- over your remote connection: + Use [WebVNC](../access/graphical_applications_with_webvnc.md) to start a remote desktop session in + a web browser. +- Slides from user training: [Parallel Debugging with DDT](misc/debugging_ddt.pdf) ### Serial Program Example @@ -95,9 +96,9 @@ srun: job 123456 has been allocated resources marie@compute$ ddt ./myprog ``` -- Run dialog window of DDT opens. -- Optionally: configure options like program arguments. -- Hit *Run*. +- Run dialog window of DDT opens. +- Optionally: configure options like program arguments. +- Hit *Run*. ### Multi-threaded Program Example @@ -110,10 +111,10 @@ srun: job 123457 has been allocated resources marie@compute$ ddt ./myprog ``` -- Run dialog window of DDT opens. -- Optionally: configure options like program arguments. -- If OpenMP: set number of threads. -- Hit *Run*. +- Run dialog window of DDT opens. +- Optionally: configure options like program arguments. +- If OpenMP: set number of threads. +- Hit *Run*. ### MPI-Parallel Program Example @@ -128,27 +129,27 @@ salloc: Granted job allocation 123458 marie@login$ ddt srun ./myprog ``` -- Run dialog window of DDT opens. -- If MPI-OpenMP-hybrid: set number of threads. -- Hit *Run* +- Run dialog window of DDT opens. +- If MPI-OpenMP-hybrid: set number of threads. +- Hit *Run* ## Memory Debugging -- Memory debuggers find memory management bugs, e.g. - - Use of non-initialized memory - - Access memory out of allocated bounds -- DDT has memory debugging included (needs to be enabled in the run dialog) +- Memory debuggers find memory management bugs, e.g. + - Use of non-initialized memory + - Access memory out of allocated bounds +- DDT has memory debugging included (needs to be enabled in the run dialog) ### Valgrind (Memcheck) -- Simulation of the program run in a virtual machine which accurately observes memory operations. -- Extreme run time slow-down: use small program runs! -- Finds more memory errors than other debuggers. 
-- Further information: - - [Valgrind Website](http://www.valgrind.org) - - [Memcheck Manual](https://www.valgrind.org/docs/manual/mc-manual.html) - (explanation of output, command-line options) -- For serial or multi-threaded programs: +- Simulation of the program run in a virtual machine which accurately observes memory operations. +- Extreme run time slow-down: use small program runs! +- Finds more memory errors than other debuggers. +- Further information: + - [Valgrind Website](http://www.valgrind.org) + - [Memcheck Manual](https://www.valgrind.org/docs/manual/mc-manual.html) + (explanation of output, command-line options) +- For serial or multi-threaded programs: ```console marie@login$ module load Valgrind @@ -156,12 +157,12 @@ Module Valgrind/3.14.0-foss-2018b and 12 dependencies loaded. marie@login$ srun -n 1 valgrind ./myprog ``` -- Not recommended for MPI parallel programs, since usually the MPI library will throw - a lot of errors. But you may use valgrind the following way such that every rank - writes its own valgrind logfile: +- Not recommended for MPI parallel programs, since usually the MPI library will throw + a lot of errors. But you may use Valgrind the following way such that every rank + writes its own Valgrind logfile: ```console marie@login$ module load Valgrind Module Valgrind/3.14.0-foss-2018b and 12 dependencies loaded. -marie@login$ srun -n <number of processes> valgrind --log-file=valgrind-%p.out ./myprog +marie@login$ srun -n <number of processes> valgrind --log-file=valgrind-%p.out ./myprog ``` diff --git a/doc.zih.tu-dresden.de/docs/software/distributed_training.md b/doc.zih.tu-dresden.de/docs/software/distributed_training.md index 98695dc1204a9c9223aa78ff49391519988c9b50..ed5305532adf58cb5a5823901f9a35235ba5c9cb 100644 --- a/doc.zih.tu-dresden.de/docs/software/distributed_training.md +++ b/doc.zih.tu-dresden.de/docs/software/distributed_training.md @@ -60,14 +60,14 @@ package to synchronize gradients and buffers. The tutorial could be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). -To use distributed data parallelisation on Taurus please use following +To use distributed data parallelization on ZIH system please use following parameters: `--ntasks-per-node` -parameter to the number of GPUs you use per node. Also, it could be useful to increase `memomy/cpu` parameters if you run larger models. Memory can be set up to: ---mem=250000 and --cpus-per-task=7 for the **ml** partition. +`--mem=250000` and `--cpus-per-task=7` for the `ml` partition. ---mem=60000 and --cpus-per-task=6 for the **gpu2** partition. +`--mem=60000` and `--cpus-per-task=6` for the `gpu2` partition. Keep in mind that only one memory parameter (`--mem-per-cpu` = <MB> or `--mem`=<MB>) can be specified @@ -83,7 +83,7 @@ TensorFlow. #### Why use Horovod? -Horovod allows you to easily take a single-GPU TensorFlow and Pytorch +Horovod allows you to easily take a single-GPU TensorFlow and PyTorch program and successfully train it on many GPUs! In some cases, the MPI model is much more straightforward and requires far less code changes than the distributed code from TensorFlow for @@ -94,7 +94,7 @@ in some cases better results than pure TensorFlow and PyTorch. Horovod is available as a module with **TensorFlow** or **PyTorch**for **all** module environments. Please check the [software module list](modules.md) for the current version of the software. 
-Horovod can be loaded like other software on the Taurus: +Horovod can be loaded like other software on ZIH system: ```Bash ml av Horovod #Check available modules with Python @@ -140,21 +140,21 @@ conda create --prefix=<location_for_your_environment> python=3.6 anaconda conda activate <location_for_your_environment> #activate virtual environment ``` -Install Pytorch (not recommended) +Install PyTorch (not recommended) ```Bash cd /tmp -git clone https://github.com/pytorch/pytorch #clone Pytorch from the source +git clone https://github.com/pytorch/pytorch #clone PyTorch from the source cd pytorch #go to folder git checkout v1.7.1 #Checkout version (example: 1.7.1) git submodule update --init #Update dependencies python setup.py install #install it with python ``` -##### Install Horovod for Pytorch with python and pip +##### Install Horovod for PyTorch with python and pip -In the example presented installation for the Pytorch without -TensorFlow. Adapt as required and refer to the horovod documentation for +In the example presented installation for the PyTorch without +TensorFlow. Adapt as required and refer to the Horovod documentation for details. ```Bash diff --git a/doc.zih.tu-dresden.de/docs/software/hyperparameter_optimization.md b/doc.zih.tu-dresden.de/docs/software/hyperparameter_optimization.md index 1c612b58327b0c18baad109e296a25fd5f2c5544..437f5c498d7250cb080497bfde1cae1bfa01a1fb 100644 --- a/doc.zih.tu-dresden.de/docs/software/hyperparameter_optimization.md +++ b/doc.zih.tu-dresden.de/docs/software/hyperparameter_optimization.md @@ -10,12 +10,12 @@ are intuitive testing, grid search or random search. The tool OmniOpt performs hyperparameter optimization within a broad range of applications as classical simulations or machine learning algorithms. -Omniopt is robust and it checks and installs all dependencies automatically and fixes many -problems in the background. While Omniopt optimizes, no further intervention is required. -You can follow the ongoing stdout (standard output) live in the console. -Omniopt’s overhead is minimal and virtually imperceptible. +OmniOpt is robust and it checks and installs all dependencies automatically and fixes many +problems in the background. While OmniOpt optimizes, no further intervention is required. +You can follow the ongoing output live in the console. +Overhead of OmniOpt is minimal and virtually imperceptible. -## Quickstart with OmniOpt +## Quick start with OmniOpt The following instructions demonstrate the basic usage of OmniOpt on the ZIH system, based on the hyperparameter optimization for a neural network. @@ -31,7 +31,7 @@ The typical OmniOpt workflow comprises at least the following steps: The following example application script was created from [https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html](https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html){:target="_blank"} as a starting point. -Therein, a neural network is trained on the MNIST Fashion dataset. +Therein, a neural network is trained on the MNIST Fashion data set. There are three script preparation steps for OmniOpt: @@ -43,7 +43,7 @@ There are three script preparation steps for OmniOpt: ??? note "Parsing arguments in Python" There are many ways for parsing arguments into Python scripts. 
- The most easiest approach is the sys module (see + The most easiest approach is the `sys` module (see [https://www.geeksforgeeks.org/how-to-use-sys-argv-in-python/](https://www.geeksforgeeks.org/how-to-use-sys-argv-in-python/){:target="_blank"}), which would be fully sufficient for usage with OmniOpt. Nevertheless, this basic approach has no consistency checks or error handling etc. @@ -51,7 +51,7 @@ There are three script preparation steps for OmniOpt: + Mark the output of the optimization target (chosen here: average loss) by prefixing it with the RESULT string. OmniOpt takes the **last appearing value** prefixed with the RESULT string. - In the example different epochs are performed and the average from the last epoch is catched + In the example different epochs are performed and the average from the last epoch is caught by OmniOpt. Additionally, the RESULT output has to be a **single line**. After all these changes, the final script is as follows (with the lines containing relevant changes highlighted). @@ -92,7 +92,7 @@ There are three script preparation steps for OmniOpt: num_nodes_out1 = args.out_layer1 num_nodes_out2 = args.out_layer2 - # Download training data from open datasets. + # Download training data from open data sets. training_data = datasets.FashionMNIST( root="data", train=True, @@ -100,7 +100,7 @@ There are three script preparation steps for OmniOpt: transform=ToTensor(), ) - # Download test data from open datasets. + # Download test data from open data sets. test_data = datasets.FashionMNIST( root="data", train=False, @@ -196,15 +196,16 @@ There are three script preparation steps for OmniOpt: ### Configure and Run OmniOpt -As a starting point, configuring OmniOpt is done via a GUI at [https://imageseg.scads.ai/omnioptgui/](https://imageseg.scads.ai/omnioptgui/). -This GUI guides through the configuration process and as result the config file is created -automatically according to the GUI input. If you are more familiar with using OmniOpt later on, this -config file can be modified directly without using the GUI. +As a starting point, configuring OmniOpt is done via a GUI at +[https://imageseg.scads.ai/omnioptgui/](https://imageseg.scads.ai/omnioptgui/). +This GUI guides through the configuration process and as result the configuration file is created +automatically according to the GUI input. If you are more familiar with using OmniOpt later on, +this configuration file can be modified directly without using the GUI. -A screenshot of the GUI, including a properly configuration for the MNIST fashion example is shown -below. The GUI, in which the displayed values are already entered, can be reached [here](https://imageseg.scads.ai/omnioptgui/?maxevalserror=5&mem_per_worker=1000&projectname=mnist-fashion-optimization-set-1&partition=alpha&searchtype=tpe.suggest&objective_program=bash%20%2Fscratch%2Fws%2Fpath%2Fto%2Fyou%2Fscript%2Frun-mnist-fashion.sh%20(%24x_0)%20(%24x_1)%20(%24x_2)¶m_0_type=hp.randint¶m_1_type=hp.randint&number_of_parameters=3){:target="_blank"}. +A screenshot of the GUI, including a properly configuration for the MNIST fashion example is shown below. 
+The GUI, in which the displayed values are already entered, can be reached [here](https://imageseg.scads.ai/omnioptgui/?maxevalserror=5&mem_per_worker=1000&projectname=mnist-fashion-optimization-set-1&partition=alpha&searchtype=tpe.suggest&objective_program=bash%20%2Fscratch%2Fws%2Fpath%2Fto%2Fyou%2Fscript%2Frun-mnist-fashion.sh%20(%24x_0)%20(%24x_1)%20(%24x_2)¶m_0_type=hp.randint¶m_1_type=hp.randint&number_of_parameters=3){:target="_blank"}. - +![GUI for configuring OmniOpt]**TODO**(misc/hyperparameter_optimization-OmniOpt-GUI.png) {: align="center"} ### Check and Evaluate OmniOpt Results diff --git a/doc.zih.tu-dresden.de/docs/software/machine_learning.md b/doc.zih.tu-dresden.de/docs/software/machine_learning.md index 0654060b61eda54b40c27e6263132ec23198214e..e3a84c96c2e74a66e35ebcb656aaabf7686d9ca3 100644 --- a/doc.zih.tu-dresden.de/docs/software/machine_learning.md +++ b/doc.zih.tu-dresden.de/docs/software/machine_learning.md @@ -24,8 +24,8 @@ ml partition has 6x Tesla V-100 GPUs. You can find a detailed specification of t On the `ml` partition load the module environment: ```console -marie@login$ srun -p ml --gres=gpu:1 -n 1 -c 7 --pty --mem-per-cpu=8000 bash #Job submission in ml nodes with 1 gpu on 1 node with 8000 Mb per CPU -marie@ml$ module load modenv/ml #example output: The following have been reloaded with a version change: 1) modenv/scs5 => modenv/ml +marie@ml$ module load modenv/ml +The following have been reloaded with a version change: 1) modenv/scs5 => modenv/ml ``` ### Power AI @@ -44,8 +44,8 @@ space (/tmp) on an NVMe device. You can find more details of the partition [here On the **Alpha** partition load the module environment: ```console -marie@login$ srun -p alpha --gres=gpu:1 -n 1 -c 7 --pty --mem-per-cpu=8000 bash #Job submission on alpha nodes with 1 gpu on 1 node with 8000 Mb per CPU -marie@romeo$ module load modenv/scs5 +marie@alpha$ module load modenv/scs5 +The following have been reloaded with a version change: 1) modenv/ml => modenv/scs5 ``` ## Machine Learning via Console @@ -72,7 +72,7 @@ For more details on machine learning or data science with R see [here](../data_a The [Jupyter Notebook](https://jupyter.org/) is an open-source web application that allows you to create documents containing live code, equations, visualizations, and narrative text. [JupyterHub](../access/jupyterhub.md) -allows to work with machine learning frameworks (e.g. TensorFlow or Pytorch) on Taurus and to run +allows to work with machine learning frameworks (e.g. TensorFlow or PyTorch) on ZIH system and to run your Jupyter notebooks on HPC nodes. After accessing JupyterHub, you can start a new session and configure it. For machine learning @@ -88,13 +88,12 @@ container system is a widely used tool. Docker containers can also be used by Si find further information on working with containers on ZIH systems [here](containers.md) There are two sources for containers for Power9 architecture with -Tensorflow and PyTorch on the board: +TensorFlow and PyTorch on the board: -* [Tensorflow-ppc64le](https://hub.docker.com/r/ibmcom/tensorflow-ppc64le): - Community-supported ppc64le docker container for TensorFlow. +* [TensorFlow-ppc64le](https://hub.docker.com/r/ibmcom/tensorflow-ppc64le): + Community-supported `ppc64le` docker container for TensorFlow. * [PowerAI container](https://hub.docker.com/r/ibmcom/powerai/): - Official Docker container with Tensorflow, PyTorch and many other packages. - Heavy container. It requires a lot of space. Could be found on Taurus. 
+ Official Docker container with TensorFlow, PyTorch and many other packages. Note: You could find other versions of software in the container on the "tag" tab on the docker web page of the container. @@ -103,19 +102,20 @@ In the following example, we build a Singularity container with TensorFlow from start it: ```console -marie@login$ srun -p ml -N 1 --gres=gpu:1 --time=02:00:00 --pty --mem-per-cpu=8000 bash #allocating resourses from ml nodes to start the job to create a container. marie@ml$ singularity build my-ML-container.sif docker://ibmcom/tensorflow-ppc64le #create a container from the DockerHub with the last TensorFlow version +[...] marie@ml$ singularity run --nv my-ML-container.sif #run my-ML-container.sif container supporting the Nvidia's GPU. You can also work with your container by: singularity shell, singularity exec +[...] ``` ## Additional Libraries for Machine Learning The following NVIDIA libraries are available on all nodes: -| | | -|-------|---------------------------------------| -| NCCL | /usr/local/cuda/targets/ppc64le-linux | -| cuDNN | /usr/local/cuda/targets/ppc64le-linux | +| | | +|-------|-----------------------------------------| +| NCCL | `/usr/local/cuda/targets/ppc64le-linux` | +| cuDNN | `/usr/local/cuda/targets/ppc64le-linux` | Note: For optimal NCCL performance it is recommended to set the **NCCL_MIN_NRINGS** environment variable during execution. You can try @@ -129,14 +129,14 @@ marie@compute$ export NCCL_MIN_NRINGS=4 The following HPC related software is installed on all nodes: -| | | -|------------------|------------------------| -| IBM Spectrum MPI | /opt/ibm/spectrum_mpi/ | -| PGI compiler | /opt/pgi/ | -| IBM XLC Compiler | /opt/ibm/xlC/ | -| IBM XLF Compiler | /opt/ibm/xlf/ | -| IBM ESSL | /opt/ibmmath/essl/ | -| IBM PESSL | /opt/ibmmath/pessl/ | +| | | +|------------------|--------------------------| +| IBM Spectrum MPI | `/opt/ibm/spectrum_mpi/` | +| PGI compiler | `/opt/pgi/` | +| IBM XLC Compiler | `/opt/ibm/xlC/` | +| IBM XLF Compiler | `/opt/ibm/xlf/` | +| IBM ESSL | `/opt/ibmmath/essl/` | +| IBM PESSL | `/opt/ibmmath/pessl/` | ## Datasets for Machine Learning @@ -148,7 +148,7 @@ still need to download some datasets use [DataMover](../../data_transfer/data_mo ### The ImageNet dataset The ImageNet project is a large visual database designed for use in visual object recognition -software research. In order to save space in the file system by avoiding to have multiple duplicates +software research. In order to save space in the filesystem by avoiding to have multiple duplicates of this lying around, we have put a copy of the ImageNet database (ILSVRC2012 and ILSVR2017) under `/scratch/imagenet` which you can use without having to download it again. For the future, the ImageNet dataset will be available in warm_archive. ILSVR2017 also includes a dataset for diff --git a/doc.zih.tu-dresden.de/docs/software/overview.md b/doc.zih.tu-dresden.de/docs/software/overview.md index 835d22204fcda298899f49d4b2a95b092b7e3da1..f8f4bf32b66c73234ad6db3cb728662e0d33dd7e 100644 --- a/doc.zih.tu-dresden.de/docs/software/overview.md +++ b/doc.zih.tu-dresden.de/docs/software/overview.md @@ -29,11 +29,11 @@ list]**todo link**. <!--After logging in, you are on one of the login nodes. They are not meant for work, but only for the--> <!--login process and short tests. 
Allocating resources will be done by batch system--> -<!--[SLURM](../jobs_and_resources/slurm.md).--> +<!--[Slurm](../jobs_and_resources/slurm.md).--> ## Modules -Usage of software on HPC systems, e.g., frameworks, compilers, loader and libraries, is +Usage of software on ZIH systems, e.g., frameworks, compilers, loader and libraries, is almost always managed by a **modules system**. Thus, it is crucial to be familiar with the [modules concept and its commands](modules.md). A module is a user interface that provides utilities for the dynamic modification of a user's environment without manual modifications. @@ -47,8 +47,8 @@ The [Jupyter Notebook](https://jupyter.org/) is an open-source web application t documents containing live code, equations, visualizations, and narrative text. There is a [JupyterHub](../access/jupyterhub.md) service on ZIH systems, where you can simply run your Jupyter notebook on compute nodes using [modules](#modules), preloaded or custom virtual environments. -Moreover, you can run a [manually created remote jupyter server](deep_learning.md) for more specific -cases. +Moreover, you can run a [manually created remote jupyter server](../archive/install_jupyter.md) +for more specific cases. ## Containers diff --git a/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md b/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md index 9d04be6db8c6aa82ebb4f8ee17e341c39be19a4d..b4c2db5bb9af31f4d2c0e080289742b07fc6b359 100644 --- a/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md +++ b/doc.zih.tu-dresden.de/docs/software/python_virtual_environments.md @@ -1,131 +1,122 @@ # Python Virtual Environments -## ToDo +Virtual environments allow users to install additional python packages and create an isolated +run-time environment. We recommend using `virtualenv` for this purpose. In your virtual environment, +you can use packages from the [modules list](modules.md) or if you didn't find what you need you can +install required packages with the command: `pip install`. With the command `pip freeze`, you can +see a list of all installed packages and their versions. -Link to this page from other DA/ML topics. +There are two methods of how to work with virtual environments on ZIH systems: -## copied from alpha_centauri.md +1. **virtualenv** is a standard Python tool to create isolated Python environments. + It is the preferred interface for + managing installations and virtual environments on ZIH system and part of the Python modules. -??? comment - copied from `alpha_centauri.md`. Please remove there if this article is finished +2. **conda** is an alternative method for managing installations and +virtual environments on ZIH system. conda is an open-source package +management system and environment management system from Anaconda. The +conda manager is included in all versions of Anaconda and Miniconda. -Virtual environments allow users to install additional python packages and create an isolated -runtime environment. We recommend using `virtualenv` for this purpose. +!!! warning + Keep in mind that you **cannot** use virtualenv for working + with the virtual environments previously created with conda tool and + vice versa! Prefer virtualenv whenever possible. 
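+
+As a small illustration of the commands mentioned above, installing an additional package into an
+already activated environment and listing the installed packages could look like this (the package
+`numpy` is only an example):
+
+```console
+(env) marie@compute$ pip install numpy     # install an additional package into the active environment
+(env) marie@compute$ pip freeze            # list all installed packages and their versions
+```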
-```console -marie@login$ srun --partition=alpha-interactive --nodes=1 --cpus-per-task=1 --gres=gpu:1 --time=01:00:00 --pty bash -marie@alpha$ mkdir python-environments # please use workspaces -marie@alpha$ module load modenv/hiera GCC/10.2.0 CUDA/11.1.1 OpenMPI/4.0.5 PyTorch -Module GCC/10.2.0, CUDA/11.1.1, OpenMPI/4.0.5, PyTorch/1.9.0 and 54 dependencies loaded. -marie@alpha$ which python -/sw/installed/Python/3.8.6-GCCcore-10.2.0/bin/python -marie@alpha$ pip list -[...] -marie@alpha$ virtualenv --system-site-packages python-environments/my-torch-env -created virtual environment CPython3.8.6.final.0-64 in 42960ms - creator CPython3Posix(dest=~/python-environments/my-torch-env, clear=False, global=True) - seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=~/.local/share/virtualenv) - added seed packages: pip==21.1.3, setuptools==57.2.0, wheel==0.36.2 - activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator -marie@alpha$ source python-environments/my-torch-env/bin/activate -(my-torch-env) marie@alpha$ pip install torchvision -[...] -Installing collected packages: torchvision -Successfully installed torchvision-0.10.0 -[...] -(my-torch-env) marie@alpha$ python -c "import torchvision; print(torchvision.__version__)" -0.10.0+cu102 -(my-torch-env) marie@alpha$ deactivate -``` +## Python Virtual Environment -## copied from python.md +This example shows how to start working with **virtualenv** and Python virtual environment (using +the module system). -??? comment - clear up the following. Maybe leave only conda stuff... +??? hint + We recommend to use [workspaces](../../data_lifecycle/workspaces) for your virtual environments. -There are two methods of how to work with virtual environments on -Taurus: +At first we check available Python modules and load the preferred version: -1. **Vitualenv** is a standard Python tool to create isolated Python environments. - It is the preferred interface for - managing installations and virtual environments on Taurus and part of the Python modules. +```console +marie@compute$ module avail Python #Check the available modules with Python +[...] +marie@compute$ module load Python #Load default Python +Module Python/3.7 2-GCCcore-8.2.0 with 10 dependencies loaded +marie@compute$ which python #Check which python are you using +/sw/installed/Python/3.7.2-GCCcore-8.2.0/bin/python +``` -2. **Conda** is an alternative method for managing installations and -virtual environments on Taurus. Conda is an open-source package -management system and environment management system from Anaconda. The -conda manager is included in all versions of Anaconda and Miniconda. +Then create the virtual environment and activate it. -**Note:** Keep in mind that you **cannot** use virtualenv for working -with the virtual environments previously created with conda tool and -vice versa! Prefer virtualenv whenever possible. +```console +marie@compute$ ws_allocate -F scratch python_virtual_environment 1 +Info: creating workspace. +/scratch/ws/1/python_virtual_environment +[...] +marie@compute$ virtualenv --system-site-packages /scratch/ws/1/python_virtual_environment/env #Create virtual environment +[...] +marie@compute$ source /scratch/ws/1/python_virtual_environment/env/bin/activate #Activate virtual environment. 
Example output: (envtest) bash-4.2$ +``` -This example shows how to start working -with **Virtualenv** and Python virtual environment (using the module system) +Now you can work in this isolated environment, without interfering with other tasks running on the +system. Note that the inscription (env) at the beginning of each line represents that you are in +the virtual environment. You can deactivate the environment as follows: -```Bash -srun -p ml -N 1 -n 1 -c 7 --mem-per-cpu=5772 --gres=gpu:1 --time=04:00:00 --pty bash #Job submission in ml nodes with 1 gpu on 1 node. +```console +(env) marie@compute$ deactivate #Leave the virtual environment +``` -mkdir python-environments # Optional: Create folder. Please use Workspaces! +## Conda Virtual Environment -module load modenv/ml # Changing the environment. Example output: The following have been reloaded with a version change: 1 modenv/scs5 => modenv/ml -ml av Python #Check the available modules with Python -module load Python #Load default Python. Example output: Module Python/3.7 4-GCCcore-8.3.0 with 7 dependencies loaded -which python #Check which python are you using -virtualenv --system-site-packages python-environments/envtest #Create virtual environment -source python-environments/envtest/bin/activate #Activate virtual environment. Example output: (envtest) bash-4.2$ -python #Start python +This example shows how to start working with **conda** and virtual environment (with using module +system). At first we use an interactive job and create a directory for the conda virtual +environment: -from time import gmtime, strftime -print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) #Example output: 2019-11-18 13:54:16 -deactivate #Leave the virtual environment +```console +marie@compute$ ws_allocate -F scratch conda_virtual_environment 1 +Info: creating workspace. +/scratch/ws/1/conda_virtual_environment +[...] ``` -The [virtualenv](https://virtualenv.pypa.io/en/latest/) Python module (Python 3) provides support -for creating virtual environments with their own sitedirectories, optionally isolated from system -site directories. Each virtual environment has its own Python binary (which matches the version of -the binary that was used to create this environment) and can have its own independent set of -installed Python packages in its site directories. This allows you to manage separate package -installations for different projects. It essentially allows us to create a virtual isolated Python -installation and install packages into that virtual installation. When you switch projects, you can -simply create a new virtual environment and not have to worry about breaking the packages installed -in other environments. - -In your virtual environment, you can use packages from the (Complete List of -Modules)(SoftwareModulesList) or if you didn't find what you need you can install required packages -with the command: `pip install`. With the command `pip freeze`, you can see a list of all installed -packages and their versions. - -This example shows how to start working with **Conda** and virtual -environment (with using module system) - -```Bash -srun -p ml -N 1 -n 1 -c 7 --mem-per-cpu=5772 --gres=gpu:1 --time=04:00:00 --pty bash # Job submission in ml nodes with 1 gpu on 1 node. 
- -module load modenv/ml -mkdir conda-virtual-environments #create a folder -cd conda-virtual-environments #go to folder -which python #check which python are you using -module load PythonAnaconda/3.6 #load Anaconda module -which python #check which python are you using now - -conda create -n conda-testenv python=3.6 #create virtual environment with the name conda-testenv and Python version 3.6 -conda activate conda-testenv #activate conda-testenv virtual environment - -conda deactivate #Leave the virtual environment +Then we load Anaconda, create an environment in our directory and activate the environment: + +```console +marie@compute$ module load Anaconda3 #load Anaconda module +marie@compute$ conda create --prefix /scratch/ws/1/conda_virtual_environment/conda-testenv python=3.6 #create virtual environment with Python version 3.6 +marie@compute$ conda activate /scratch/ws/1/conda_virtual_environment/conda-testenv #activate conda-testenv virtual environment ``` -You can control where a conda environment -lives by providing a path to a target directory when creating the -environment. For example, the following command will create a new -environment in a workspace located in `scratch` +Now you can work in this isolated environment, without interfering with other tasks running on the +system. Note that the inscription (env) at the beginning of each line represents that you are in +the virtual environment. You can deactivate the conda environment as follows: -```Bash -conda create --prefix /scratch/ws/<name_of_your_workspace>/conda-virtual-environment/<name_of_your_environment> +```console +(conda-testenv) marie@compute$ conda deactivate #Leave the virtual environment ``` -Please pay attention, -using srun directly on the shell will lead to blocking and launch an -interactive job. Apart from short test runs, it is **recommended to -launch your jobs into the background by using Slurm**. For that, you can conveniently put -the parameters directly into the job file which you can submit using -`sbatch [options] <job file>.` +TODO: Link to this page from other DA/ML topics. insert link in alpha centauri + +??? example + This is an example on alpha partition. The example creates a virtual environment, and installs + the package `torchvision` with pip. + ```console + marie@login$ srun --partition=alpha-interactive -N=1 --gres=gpu:1 --time=01:00:00 --pty bash + marie@alpha$ mkdir python-environments # please use workspaces + marie@alpha$ module load modenv/hiera GCC/10.2.0 CUDA/11.1.1 OpenMPI/4.0.5 PyTorch + Module GCC/10.2.0, CUDA/11.1.1, OpenMPI/4.0.5, PyTorch/1.9.0 and 54 dependencies loaded. + marie@alpha$ which python + /sw/installed/Python/3.8.6-GCCcore-10.2.0/bin/python + marie@alpha$ pip list + [...] + marie@alpha$ virtualenv --system-site-packages python-environments/my-torch-env + created virtual environment CPython3.8.6.final.0-64 in 42960ms + creator CPython3Posix(dest=~/python-environments/my-torch-env, clear=False, global=True) + seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=~/.local/share/virtualenv) + added seed packages: pip==21.1.3, setuptools==57.2.0, wheel==0.36.2 + activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator + marie@alpha$ source python-environments/my-torch-env/bin/activate + (my-torch-env) marie@alpha$ pip install torchvision + [...] + Installing collected packages: torchvision + Successfully installed torchvision-0.10.0 + [...] 
+    (my-torch-env) marie@alpha$ python -c "import torchvision; print(torchvision.__version__)"
+    0.10.0+cu102
+    (my-torch-env) marie@alpha$ deactivate
+    ```
diff --git a/doc.zih.tu-dresden.de/docs/software/pytorch.md b/doc.zih.tu-dresden.de/docs/software/pytorch.md
index 9bfadf38483140f03e8332c196bccbc52857f348..f8f0e6d0cbf780376397def082634980ecdb6b5a 100644
--- a/doc.zih.tu-dresden.de/docs/software/pytorch.md
+++ b/doc.zih.tu-dresden.de/docs/software/pytorch.md
@@ -2,9 +2,10 @@
 [PyTorch](https://pytorch.org/){:target="_blank"} is an open-source machine learning framework.
 It is an optimized tensor library for deep learning using GPUs and CPUs.
-PyTorch is a machine learning tool developed by Facebooks AI division to process large-scale object
-detection, segmentation, classification, etc. PyTorch provides a core datastructure, the tensor, a
-multi-dimensional array that shares many similarities with Numpy arrays.
+PyTorch is a machine learning tool developed by Facebook's AI division to process large-scale
+object detection, segmentation, classification, etc.
+PyTorch provides a core data structure, the tensor, a multi-dimensional array that shares many
+similarities with NumPy arrays.

 Please check the software modules list via

@@ -86,7 +87,7 @@ marie@ml$ python -c "import torch; print(torch.__version__)"
 In addition to using interactive and batch jobs, it is possible to work with Pytorch using
 JupyterHub. The production and test environments of JupyterHub contain Python kernels, that come
 with a Pytorch support.
-
+![PyTorch module in JupyterHub]**TODO**(misc/Pytorch_jupyter_module.png)
 {: align="center"}

 ## Distributed Pytorch
diff --git a/doc.zih.tu-dresden.de/docs/software/software_development_overview.md b/doc.zih.tu-dresden.de/docs/software/software_development_overview.md
index c87d4c93b5fe27ba82ca261aad359df48a7e741c..d2dd73ed3a56bc49d31123cec65bc8694e7f0f10 100644
--- a/doc.zih.tu-dresden.de/docs/software/software_development_overview.md
+++ b/doc.zih.tu-dresden.de/docs/software/software_development_overview.md
@@ -37,9 +37,7 @@ Some questions you should ask yourself:

 Subsections:

 - [Compilers](compilers.md)
-- [Debugging Tools](Debugging Tools.md)
-  - [Debuggers](debuggers.md) (GDB, Allinea DDT, Totalview)
-  - [Tools to detect MPI usage errors](mpi_usage_error_detection.md) (MUST)
+- [Debugging](debuggers.md)
 - PerformanceTools.md: [Score-P](scorep.md), [Vampir](vampir.md)
 - [Libraries](libraries.md)
diff --git a/doc.zih.tu-dresden.de/docs/software/tensorboard.md b/doc.zih.tu-dresden.de/docs/software/tensorboard.md
index 7272c7f2bbaa5da37f9a2e390812b26b51a17e34..a2da2b7c626839f59d3e4eda8d7a7a167b7f8ec4 100644
--- a/doc.zih.tu-dresden.de/docs/software/tensorboard.md
+++ b/doc.zih.tu-dresden.de/docs/software/tensorboard.md
@@ -8,26 +8,31 @@ whether a specific TensorFlow module provides TensorBoard, use the following command:

 ```console
 marie@compute$ module spider TensorFlow/2.3.1
+[...]
+Included extensions
+[...]
 ```

 If TensorBoard occurs in the `Included extensions` section of the output, TensorBoard is available.
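+
+If you only want to see that part of the output, you can filter it, for example like this (a
+sketch; the `module` command writes its output to stderr, hence the redirection):
+
+```console
+marie@compute$ module spider TensorFlow/2.3.1 2>&1 | grep -A2 "Included extensions"
+```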
## Using TensorBoard -To use TensorBoard, you have to connect via ssh to taurus as usual, schedule an interactive job and -load a TensorFlow module: +To use TensorBoard, you have to connect via ssh to the ZIH system as usual, schedule an interactive +job and load a TensorFlow module: ```console -marie@login$ srun -p alpha -n 1 -c 1 --pty --mem-per-cpu=8000 bash #Job submission on alpha node -marie@alpha$ module load TensorFlow/2.3.1 -marie@alpha$ tensorboard --logdir /scratch/gpfs/<YourNetID>/myproj/log --bind_all +marie@compute$ module load TensorFlow/2.3.1 +Module TensorFlow/2.3.1-fosscuda-2019b-Python-3.7.4 and 47 dependencies loaded. ``` Then create a workspace for the event data, that should be visualized in TensorBoard. If you already have an event data directory, you can skip that step. ```console -marie@alpha$ ws_allocate -F scratch tensorboard_logdata 1 +marie@compute$ ws_allocate -F scratch tensorboard_logdata 1 +Info: creating workspace. +/scratch/ws/1/marie-tensorboard_logdata +[...] ``` Now you can run your TensorFlow application. Note that you might have to adapt your code to make it @@ -35,10 +40,13 @@ accessible for TensorBoard. Please find further information on the official [Ten Then you can start TensorBoard and pass the directory of the event data: ```console -marie@alpha$ tensorboard --logdir /scratch/ws/1/marie-tensorboard_logdata --bind_all +marie@compute$ tensorboard --logdir /scratch/ws/1/marie-tensorboard_logdata --bind_all +[...] +TensorBoard 2.3.0 at http://taurusi8034.taurus.hrsk.tu-dresden.de:6006/ +[...] ``` -TensorBoard will then return a server address on taurus, e.g. `taurusi8034.taurus.hrsk.tu-dresden.de:6006` +TensorBoard will then return a server address on Taurus, e.g. `taurusi8034.taurus.hrsk.tu-dresden.de:6006` For accessing TensorBoard now, you have to set up some port forwarding via ssh to your local machine: @@ -47,6 +55,6 @@ machine: marie@local$ ssh -N -f -L 6006:taurusi8034.taurus.hrsk.tu-dresden.de:6006 <zih-login>@taurus.hrsk.tu-dresden.de ``` -Now you can see the tensorboard in your browser at `http://localhost:6006/`. +Now you can see the TensorBoard in your browser at `http://localhost:6006/`. -Note that you can also use tensorboard in an [sbatch file](../jobs_and_resources/batch_systems.md). +Note that you can also use TensorBoard in an [sbatch file](../jobs_and_resources/batch_systems.md). diff --git a/doc.zih.tu-dresden.de/docs/software/tensorflow.md b/doc.zih.tu-dresden.de/docs/software/tensorflow.md index 7f03a743ac6acba93bcc02e1c3c4b703bfe9cb38..363d096259f7b0a17acd280ba5a1cceff5a83103 100644 --- a/doc.zih.tu-dresden.de/docs/software/tensorflow.md +++ b/doc.zih.tu-dresden.de/docs/software/tensorflow.md @@ -9,6 +9,7 @@ Please check the software modules list via ```console marie@compute$ module spider TensorFlow +[...] ``` to find out, which TensorFlow modules are available on your partition. @@ -25,42 +26,40 @@ and the TensorFlow library. 
You can find detailed hardware specification On the **Alpha** partition load the module environment: ```console -marie@login$ srun -p alpha --gres=gpu:1 -n 1 -c 7 --pty --mem-per-cpu=8000 bash #Job submission on alpha nodes with 1 gpu on 1 node with 8000 Mb per CPU marie@alpha$ module load modenv/scs5 ``` On the **ML** partition load the module environment: ```console -marie@login$ srun -p ml --gres=gpu:1 -n 1 -c 7 --pty --mem-per-cpu=8000 bash #Job submission in ml nodes with 1 gpu on 1 node with 8000 Mb per CPU -marie@ml$ module load modenv/ml #example output: The following have been reloaded with a version change: 1) modenv/scs5 => modenv/ml +marie@ml$ module load modenv/ml +The following have been reloaded with a version change: 1) modenv/scs5 => modenv/ml ``` This example shows how to install and start working with TensorFlow (with using modules system) ```console marie@ml$ module load TensorFlow -Module TensorFlow/1.10.0-PythonAnaconda-3.6 and 1 dependency loaded. +Module TensorFlow/2.3.1-fosscuda-2019b-Python-3.7.4 and 47 dependencies loaded. ``` -Now we check that we can access TensorFlow. One example is tensorflow-test: - -```console -marie@ml$ tensorflow-test -Basic test of tensorflow - A Hello World!!!... -``` - -??? example - Following example shows how to create python virtual environment and import TensorFlow. +Now we can use TensorFlow. In the following example, we create a python virtual environment and +import TensorFlow: +!!! example ```console - marie@ml$ mkdir python-environments #create folder + marie@ml$ ws_allocate -F scratch python_virtual_environment 1 + Info: creating workspace. + /scratch/ws/1/python_virtual_environment + [...] marie@ml$ which python #check which python are you using - /sw/installed/Python/3.7.4-GCCcore-8.3.0/bin/python - marie@ml$ virtualenv --system-site-packages python-environments/env #create virtual environment "env" which inheriting with global site packages + /sw/installed/Python/3.7.2-GCCcore-8.2.0 + marie@ml$ virtualenv --system-site-packages /scratch/ws/1/python_virtual_environment/env [...] - marie@ml$ source python-environments/env/bin/activate #activate virtual environment "env". Example output: (env) bash-4.2$ + marie@ml$ source /scratch/ws/1/python_virtual_environment/env/bin/activate marie@ml$ python -c "import tensorflow as tf; print(tf.__version__)" + [...] + 2.3.1 ``` ## TensorFlow in JupyterHub @@ -84,14 +83,14 @@ Another option to use TensorFlow are containers. In the HPC domain, the [Singularity](https://singularity.hpcng.org/) container system is a widely used tool. In the following example, we use the tensorflow-test in a Singularity container: -```console -marie@login$ srun -p ml --gres=gpu:1 -n 1 -c 7 --pty --mem-per-cpu=8000 bash +```console marie@ml$ singularity shell --nv /scratch/singularity/powerai-1.5.3-all-ubuntu16.04-py3.img -marie@ml$ export PATH=/opt/anaconda3/bin:$PATH -marie@ml$ source activate /opt/anaconda3 #activate conda environment -marie@ml$ . /opt/DL/tensorflow/bin/tensorflow-activate -marie@ml$ tensorflow-test +Singularity>$ export PATH=/opt/anaconda3/bin:$PATH +Singularity>$ source activate /opt/anaconda3 #activate conda environment +(base) Singularity>$ . /opt/DL/tensorflow/bin/tensorflow-activate +(base) Singularity>$ tensorflow-test Basic test of tensorflow - A Hello World!!!... +[...] ``` ## TensorFlow with Python or R @@ -123,11 +122,12 @@ tf_upgrade_v2 utility to help transition legacy code to the new API. 
## Keras -[Keras](keras.io) is a high-level neural network API, written in Python and capable of running on -top of TensorFlow. Please check the software modules list via +[Keras](https://keras.io) is a high-level neural network API, written in Python and capable +of running on top of TensorFlow. Please check the software modules list via ```console marie@compute$ module spider Keras +[...] ``` to find out, which Keras modules are available on your partition. TensorFlow should be automatically diff --git a/doc.zih.tu-dresden.de/mkdocs.yml b/doc.zih.tu-dresden.de/mkdocs.yml index 782c943e79346da0e5d2902c097450b7a6f1a7d9..92a1e5bd3b2c72db59065b19ffb54f0e01a4a80a 100644 --- a/doc.zih.tu-dresden.de/mkdocs.yml +++ b/doc.zih.tu-dresden.de/mkdocs.yml @@ -62,11 +62,11 @@ nav: - Building Software: software/building_software.md - GPU Programming: software/gpu_programming.md - Compilers: software/compilers.md - - Debuggers: software/debuggers.md + - Debugging: + - Overview: software/debuggers.md + - MPI Error Detection: software/mpi_usage_error_detection.md - Libraries: software/libraries.md - - MPI Error Detection: software/mpi_usage_error_detection.md - Score-P: software/scorep.md - - PAPI Library: software/papi_library.md - Perf Tools: software/perf_tools.md - PIKA: software/pika.md - Vampir: software/vampir.md @@ -98,7 +98,6 @@ nav: - Taurus: jobs_and_resources/system_taurus.md - Slurm Examples: jobs_and_resources/slurm_examples.md - Slurm: jobs_and_resources/slurm.md - - HPC-DA: jobs_and_resources/hpcda.md - Binding And Distribution Of Tasks: jobs_and_resources/binding_and_distribution_of_tasks.md # - Queue Policy: jobs/policy.md # - Examples: jobs/examples/index.md @@ -114,32 +113,26 @@ nav: - Overview: archive/overview.md - Bio Informatics: archive/bioinformatics.md - CXFS End of Support: archive/cxfs_end_of_support.md - - Debugging Tools: archive/debugging_tools.md - - Hardware: archive/hardware.md - - Hardware Altix: archive/hardware_altix.md - - Hardware Atlas: archive/hardware_atlas.md - - Hardware Deimos: archive/hardware_deimos.md - - Hardware Phobos: archive/hardware_phobos.md - - Hardware Titan: archive/hardware_titan.md - - Hardware Triton: archive/hardware_triton.md - - Hardware Venus: archive/hardware_venus.md - - Introduction: archive/introduction.md - KNL Nodes: archive/knl_nodes.md + - Manual Jupyter Installation: archive/install_jupyter.md - Load Leveler: archive/load_leveler.md - Migrate to Atlas: archive/migrate_to_atlas.md - No IB Jobs: archive/no_ib_jobs.md - Phase2 Migration: archive/phase2_migration.md - Platform LSF: archive/platform_lsf.md - - RamDisk Documentation: archive/ram_disk_documentation.md - - Step by Step Taurus: archive/step_by_step_taurus.md - - System Altix: archive/system_altix.md - - System Atlas: archive/system_atlas.md - - System Venus: archive/system_venus.md + - Switched-Off Systems: + - Overview: archive/systems_switched_off.md + - System Altix: archive/system_altix.md + - System Atlas: archive/system_atlas.md + - System Deimos: archive/system_deimos.md + - System Phobos: archive/system_phobos.md + - System Titan: archive/system_titan.md + - System Triton: archive/system_triton.md + - System Venus: archive/system_venus.md - Taurus II: archive/taurus_ii.md - UNICORE Rest API: archive/unicore_rest_api.md - Vampir Trace: archive/vampir_trace.md - - Venus Open: archive/venus_open.md - - Windows Batchjobs: jobs/windows_batch.md + - Windows Batchjobs: archive/windows_batch.md # Project Information @@ -187,9 +180,11 @@ extra: zih_homepage: 
https://tu-dresden.de/zih # links in footer footer: - - link: https://doc.zih.tu-dresden.de/hpc-wiki/bin/view/Compendium/Impressum - name: "Legal Notice" - - link: https://doc.zih.tu-dresden.de/hpc-wiki/bin/view/Compendium/Accessibility - name: "Accessibility" + - link: /legal_notice + name: "Legal Notice / Impressum" + - link: /accessibility + name: "Accessibility / Barrierefreiheit" + - link: /data_protection_declaration + name: "Data Protection Declaration / Datenschutzerklärung" - link: https://tu-dresden.de/zertifikate name: "Certificates" diff --git a/doc.zih.tu-dresden.de/tud_theme/stylesheets/extra.css b/doc.zih.tu-dresden.de/tud_theme/stylesheets/extra.css index df1ee3fd5b8a9265a4090ee9a648f73267e568e3..72e3129d58c81e1331eaf9168c734a8bbd519621 100644 --- a/doc.zih.tu-dresden.de/tud_theme/stylesheets/extra.css +++ b/doc.zih.tu-dresden.de/tud_theme/stylesheets/extra.css @@ -13,7 +13,6 @@ } :root { --md-text-font-family: 'Open Sans Regular', sans-serif; - --md-primary-fg-color: rgb(0, 37, 87); --md-primary-fg-color--light: rgb(39, 66, 117); --md-footer-bg-color: rgb(0, 37, 87); @@ -69,9 +68,11 @@ strong { .md-typeset code { word-break: normal; - font-size: 0.8rem; } -.md-typeset .admonition { + +.md-typeset .admonition, +.md-typeset details, +.md-typeset code { font-size: 0.8rem; } diff --git a/doc.zih.tu-dresden.de/util/check-spelling.sh b/doc.zih.tu-dresden.de/util/check-spelling.sh index 327b29ec1a80d1a361b8be4bdde2e1a93bf0e981..7fa9d2824d4a61ce86ae258d656acfe90c574269 100755 --- a/doc.zih.tu-dresden.de/util/check-spelling.sh +++ b/doc.zih.tu-dresden.de/util/check-spelling.sh @@ -1,28 +1,88 @@ #!/bin/bash +set -euo pipefail + scriptpath=${BASH_SOURCE[0]} basedir=`dirname "$scriptpath"` basedir=`dirname "$basedir"` -wordlistfile=$basedir/wordlist.aspell -acmd="aspell -p $wordlistfile --ignore 2 -l en_US list" - -function spell_check () { - file_to_check=$1 - ret=$(cat "$file_to_check" | $acmd) - if [ ! -z "$ret" ]; then - echo "-- File $file_to_check" - echo "$ret" | sort -u - fi -} +wordlistfile=$(realpath $basedir/wordlist.aspell) +branch="origin/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME:-preview}" +aspellmode= +if aspell dump modes | grep -q markdown; then + aspellmode="--mode=markdown" +fi function usage() { cat <<-EOF usage: $0 [file] -Outputs all words of the file (or, if no argument given, all files in the current directory, recursively), that the spell checker cannot recognize. +If file is given, outputs all words of the file, that the spell checker cannot recognize. +If file is omitted, checks whether any changed file contains more unrecognizable words than before the change. If you are sure a word is correct, you can put it in $wordlistfile. EOF } +function getAspellOutput(){ + aspell -p "$wordlistfile" --ignore 2 -l en_US $aspellmode list | sort -u +} + +function getNumberOfAspellOutputLines(){ + getAspellOutput | wc -l +} + +function isMistakeCountIncreasedByChanges(){ + any_fails=false + + #Unfortunately, sort depends on locale and docker does not provide much. + #Therefore, it uses bytewise comparison. We avoid problems with the command tr. + if ! 
sed 1d "$wordlistfile" | tr [:upper:] [:lower:] | sort -C; then + echo "Unsorted wordlist in $wordlistfile" + any_fails=true + fi + + source_hash=`git merge-base HEAD "$branch"` + #Remove everything except lines beginning with --- or +++ + files=`git diff $source_hash | sed -E -n 's#^(---|\+\+\+) ((/|./)[^[:space:]]+)$#\2#p'` + #echo "$files" + #echo "-------------------------" + #Assume that we have pairs of lines (starting with --- and +++). + while read oldfile; do + read newfile + if [ "${newfile: -3}" == ".md" ]; then + if [[ $newfile == *"accessibility.md"* || + $newfile == *"data_protection_declaration.md"* || + $newfile == *"legal_notice.md"* ]]; then + echo "Skip $newfile" + else + echo "Check $newfile" + if [ "$oldfile" == "/dev/null" ]; then + #Added files should not introduce new spelling mistakes + previous_count=0 + else + previous_count=`git show "$source_hash:${oldfile:2}" | getNumberOfAspellOutputLines` + fi + if [ "$newfile" == "/dev/null" ]; then + #Deleted files do not contain any spelling mistakes + current_count=0 + else + #Remove the prefix "b/" + newfile=${newfile:2} + current_count=`cat "$newfile" | getNumberOfAspellOutputLines` + fi + if [ $current_count -gt $previous_count ]; then + echo "-- File $newfile" + echo "Change increases spelling mistake count (from $previous_count to $current_count)" + any_fails=true + fi + fi + fi + done <<< "$files" + + if [ "$any_fails" == true ]; then + return 1 + fi + return 0 +} + if [ $# -eq 1 ]; then case $1 in help | -help | --help) @@ -30,13 +90,11 @@ if [ $# -eq 1 ]; then exit ;; *) - spell_check $1 + cat "$1" | getAspellOutput ;; esac elif [ $# -eq 0 ]; then - for i in `find -name \*.md`; do - spell_check $i - done + isMistakeCountIncreasedByChanges else usage fi diff --git a/doc.zih.tu-dresden.de/util/grep-forbidden-words.sh b/doc.zih.tu-dresden.de/util/grep-forbidden-words.sh new file mode 100755 index 0000000000000000000000000000000000000000..b6d586220052a2bf362aec3c4736c876e4901da6 --- /dev/null +++ b/doc.zih.tu-dresden.de/util/grep-forbidden-words.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +set -euo pipefail + +scriptpath=${BASH_SOURCE[0]} +basedir=`dirname "$scriptpath"` +basedir=`dirname "$basedir"` + +#This is the ruleset. Each line represents a rule of tab-separated fields. +#The first field represents whether the match should be case-sensitive (s) or insensitive (i). +#The second field represents the pattern that should not be contained in any file that is checked. +#Further fields represent patterns with exceptions. +#For example, the first rule says: +# The pattern \<io\> should not be present in any file (case-insensitive match), except when it appears as ".io". +ruleset="i \<io\> \.io +s \<SLURM\> +i file \+system +i \<taurus\> taurus\.hrsk /taurus +i \<hrskii\> +i hpc \+system +i hpc[ -]\+da\> +i work[ -]\+space" + +function grepExceptions () { + if [ $# -gt 0 ]; then + firstPattern=$1 + shift + grep -v "$firstPattern" | grepExceptions "$@" + else + cat - + fi +} + +function usage () { + echo "$0 [options]" + echo "Search forbidden patterns in markdown files." + echo "" + echo "Options:" + echo " -a Search in all markdown files (default: git-changed files)" + echo " -s Silent mode" + echo " -h Show help message" +} + +# Options +all_files=false +silent=false +while getopts ":ahs" option; do + case $option in + a) + all_files=true + ;; + s) + silent=true + ;; + h) + usage + exit;; + \?) # Invalid option + echo "Error: Invalid option." 
+ usage + exit;; + esac +done + +branch="origin/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME:-preview}" + +if [ $all_files = true ]; then + echo "Search in all markdown files." + files=$(git ls-tree --full-tree -r --name-only HEAD $basedir/docs/ | grep .md) +else + echo "Search in git-changed files." + files=`git diff --name-only "$(git merge-base HEAD "$branch")"` +fi + +cnt=0 +for f in $files; do + if [ "$f" != doc.zih.tu-dresden.de/README.md -a "${f: -3}" == ".md" -a -f "$f" ]; then + echo "Check wording in file $f" + while IFS=$'\t' read -r flags pattern exceptionPatterns; do + while IFS=$'\t' read -r -a exceptionPatternsArray; do + if [ $silent = false ]; then + echo " Pattern: $pattern" + fi + grepflag= + case "$flags" in + "i") + grepflag=-i + ;; + esac + if grep -n $grepflag "$pattern" "$f" | grepExceptions "${exceptionPatternsArray[@]}" ; then + ((cnt=cnt+1)) + fi + done <<< $exceptionPatterns + done <<< $ruleset + fi +done + +echo "" +case $cnt in + 1) + echo "Forbidden Patterns: 1 match found" + ;; + *) + echo "Forbidden Patterns: $cnt matches found" + ;; +esac +if [ $cnt -gt 0 ]; then + exit 1 +fi diff --git a/doc.zih.tu-dresden.de/wordlist.aspell b/doc.zih.tu-dresden.de/wordlist.aspell index 6d23d29110d57c85ecb248e0ac012652935c8022..4c4eff0d4fe4817a17b36356d408afcfb848a1ca 100644 --- a/doc.zih.tu-dresden.de/wordlist.aspell +++ b/doc.zih.tu-dresden.de/wordlist.aspell @@ -1,42 +1,169 @@ personal_ws-1.1 en 1805 +Altix analytics +APIs +BeeGFS benchmarking +broadwell +bsub +ccNUMA +centauri citable +conda CPU +CPUs CUDA +cuDNN +CXFS +dask +Dask +dataframes +DataFrames +DataParallel +DDP +DFG +DistributedDataParallel +DockerHub EasyBuild +env +ESSL +fastfs +filesystem +Filesystem +filesystems +Filesystems Flink +foreach +Fortran +GFLOPS +gfortran +gnuplot +Gnuplot GPU +GPUs hadoop +haswell Haswell HDFS Horovod +hostname HPC +hyperparameter +Hyperparameter +hyperparameters +icc +icpc +ifort ImageNet Infiniband +IOPS +Itanium +jobqueue +jpg Jupyter +JupyterHub +JupyterLab Keras +lapply +LoadLeveler +lsf +LSF +Mathematica +MEGWARE +MIMD +Miniconda +MKL +MNIST +Montecito +mountpoint +mpi MPI +mpicc +mpiCC +mpicxx +mpif +mpifort +mpirun +multicore +multithreaded +NCCL +Neptun +NFS +NRINGS +NUMA +NUMAlink +NumPy +OME +OmniOpt OPARI OpenACC OpenCL OpenMP +openmpi +OpenMPI +Opteron +overfitting PAPI +parallelization +parallelize +pdf +PESSL +PGI +pipelining +png +PowerAI +ppc +PSOCK +randint +README +Rmpi rome romeo +RSA +RStudio salloc +Saxonid sbatch ScaDS Scalasca scancel +Scikit +SciPy scontrol scp +SGI +SHA SHMEM +SLES Slurm +SMP +SMT squeue srun SSD +stderr +stdout +SUSE +TBB +TCP +TensorBoard TensorFlow +TFLOPS Theano +tmp +todo +ToDo +TODO +transferability +Trition Vampir +vectorization +venv +virtualenv +workspace +workspaces +XArray +XGBoost +XLC +XLF ZIH