From bde9d6454bf83f05f29736c6af7bd502ca6aec15 Mon Sep 17 00:00:00 2001 From: Natalie Breidenbach <natalie.breidenbach@tu-dresden.de> Date: Tue, 28 Nov 2023 13:31:04 +0100 Subject: [PATCH] Update ngc_containers.md --- .../docs/software/ngc_containers.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc.zih.tu-dresden.de/docs/software/ngc_containers.md b/doc.zih.tu-dresden.de/docs/software/ngc_containers.md index f19612d9a..a15020381 100644 --- a/doc.zih.tu-dresden.de/docs/software/ngc_containers.md +++ b/doc.zih.tu-dresden.de/docs/software/ngc_containers.md @@ -50,14 +50,14 @@ If you are not familiar with Singularity's syntax, please find the information o However, some main commands will be explained. Create a container from the image from the NGC catalog. -(For this example, the alpha is used): +(For this example, the cluster alpha is used): ```console -marie@login$ srun --partition=alpha --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 bash +marie@login.alpha$ srun --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 bash -marie@compute$ cd /scratch/ws/<name_of_your_workspace>/containers #please create a Workspace +marie@alpha$ cd /data/horse/ws/<name_of_your_workspace>/containers #please create a Workspace -marie@compute$ singularity pull pytorch:21.08-py3.sif docker://nvcr.io/nvidia/pytorch:21.08-py3 +marie@alpha$ singularity pull pytorch:21.08-py3.sif docker://nvcr.io/nvidia/pytorch:21.08-py3 ``` Now, you have a fully functional PyTorch container. @@ -73,20 +73,20 @@ To download the dataset, please follow the Also, you can find the instructions in a README file which you can find inside the container: ```console -marie@compute$ singularity exec pytorch:21.06-py3_beegfs vim /workspace/examples/resnet50v1.5/README.md +marie@alpha$ singularity exec pytorch:21.06-py3_beegfs vim /workspace/examples/resnet50v1.5/README.md ``` It is recommended to run the container with a single command. However, for the educational purpose, the separate commands will be presented below: ```console -marie@login$ srun --partition=alpha --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 bash +marie@login.alpha$ srun --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 bash ``` Run a shell within a container with the `singularity shell` command: ```console -marie@compute$ singularity shell --nv -B /scratch/imagenet:/data/imagenet pytorch:21.06-py3 +marie@alpha$ singularity shell --nv -B /data/horse/imagenet:/data/imagenet pytorch:21.06-py3 ``` The flag `--nv` in the command above was used to enable Nvidia support for GPU usage @@ -112,8 +112,8 @@ As an example, please find the full command to run the ResNet50 model on the ImageNet dataset inside the PyTorch container: ```console -marie@login$ srun --partition=alpha --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 \ - singularity exec --nv -B /scratch/ws/0/anpo879a-ImgNet/imagenet:/data/imagenet pytorch:21.06-py3 \ +marie@login.alpha$ srun --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 \ + singularity exec --nv -B /data/horse/ws/0/anpo879a-ImgNet/imagenet:/data/imagenet pytorch:21.06-py3 \ python /workspace/examples/resnet50v1.5/multiproc.py --nnodes=1 --nproc_per_node 1 \ --node_rank=0 /workspace/examples/resnet50v1.5/main.py --data-backend dali-cpu --raport-file raport.json \ -j16 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 \ @@ -136,11 +136,11 @@ An example of using the PyTorch container for the training of the ResNet50 model on the classification task on the ImageNet dataset is presented below: ```console -marie@login$ srun --partition=alpha --nodes=1 --ntasks-per-node=8 --ntasks=8 --gres=gpu:8 --time=08:00:00 --pty --mem=700G bash +marie@login.alpha$ srun --nodes=1 --ntasks-per-node=8 --ntasks=8 --gres=gpu:8 --time=08:00:00 --pty --mem=700G bash ``` ```console -marie@alpha$ singularity exec --nv -B /scratch/ws/0/marie-ImgNet/imagenet:/data/imagenet pytorch:21.06-py3 \ +marie@alpha$ singularity exec --nv -B /data/horse/ws/0/marie-ImgNet/imagenet:/data/imagenet pytorch:21.06-py3 \ python /workspace/examples/resnet50v1.5/multiproc.py --nnodes=1 --nproc_per_node 8 \ --node_rank=0 /workspace/examples/resnet50v1.5/main.py --data-backend dali-cpu \ --raport-file raport.json -j16 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 \ -- GitLab