diff --git a/doc.zih.tu-dresden.de/docs/software/cfd.md b/doc.zih.tu-dresden.de/docs/software/cfd.md index 492cb96d24f3761e2820fdba34eaa6b0a35db320..186d7b3a5a97a2daf06d8618c7c91dc91d7ab971 100644 --- a/doc.zih.tu-dresden.de/docs/software/cfd.md +++ b/doc.zih.tu-dresden.de/docs/software/cfd.md @@ -42,7 +42,7 @@ marie@login$ # source $FOAM_CSH module load OpenFOAM source $FOAM_BASH cd /scratch/ws/1/marie-example-workspace # work directory using workspace - srun pimpleFoam -parallel > "$OUTFILE" + srun pimpleFoam -parallel > "$OUTFILE" ``` ## Ansys CFX @@ -62,7 +62,7 @@ geometry and mesh generator cfx5pre, and the post-processor cfx5post. module load ANSYS cd /scratch/ws/1/marie-example-workspace # work directory using workspace - cfx-parallel.sh -double -def StaticMixer.def + cfx-parallel.sh -double -def StaticMixer.def ``` ## Ansys Fluent diff --git a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md index bc9ac622530f2b355adef7337fb5d49447d79be1..00ce0c5c4c3ddbd3654161bab69ee0a493cb4350 100644 --- a/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md +++ b/doc.zih.tu-dresden.de/docs/software/data_analytics_with_python.md @@ -212,11 +212,11 @@ for the partition `alpha` (queue at the dask terms) on the ZIH system: ```python from dask_jobqueue import SLURMCluster -cluster = SLURMCluster(queue='alpha', +cluster = SLURMCluster(queue='alpha', cores=8, - processes=2, - project='p_marie', - memory="8GB", + processes=2, + project='p_marie', + memory="8GB", walltime="00:30:00") ``` @@ -235,15 +235,15 @@ from distributed import Client from dask_jobqueue import SLURMCluster from dask import delayed -cluster = SLURMCluster(queue='alpha', +cluster = SLURMCluster(queue='alpha', cores=8, - processes=2, - project='p_marie', - memory="80GB", + processes=2, + project='p_marie', + memory="80GB", walltime="00:30:00", extra=['--resources gpu=1']) -cluster.scale(2) #scale it to 2 workers! 
+cluster.scale(2) #scale it to 2 workers! client = Client(cluster) #command will show you number of workers (python objects corresponds to jobs) ``` @@ -288,7 +288,7 @@ for the Monte-Carlo estimation of Pi. uid = int( sp.check_output('id -u', shell=True).decode('utf-8').replace('\n','') ) portdash = 10001 + uid - #create a Slurm cluster, please specify your project + #create a Slurm cluster, please specify your project cluster = SLURMCluster(queue='alpha', cores=2, project='p_marie', memory="8GB", walltime="00:30:00", extra=['--resources gpu=1'], scheduler_options={"dashboard_address": f":{portdash}"}) @@ -309,12 +309,12 @@ for the Monte-Carlo estimation of Pi. def calc_pi_mc(size_in_bytes, chunksize_in_bytes=200e6): """Calculate PI using a Monte Carlo estimate.""" - + size = int(size_in_bytes / 8) chunksize = int(chunksize_in_bytes / 8) - + xy = da.random.uniform(0, 1, size=(size / 2, 2), chunks=(chunksize / 2, 2)) - + in_circle = ((xy ** 2).sum(axis=-1) < 1) pi = 4 * in_circle.mean() @@ -327,11 +327,11 @@ for the Monte-Carlo estimation of Pi. f"\tErr: {abs(pi - np.pi) : 10.3e}\n" f"\tWorkers: {num_workers}" f"\t\tTime: {time_delta : 7.3f}s") - + #let's loop over different volumes of double-precision random numbers and estimate it for size in (1e9 * n for n in (1, 10, 100)): - + start = time() pi = calc_pi_mc(size).compute() elaps = time() - start @@ -339,7 +339,7 @@ for the Monte-Carlo estimation of Pi. print_pi_stats(size, pi, time_delta=elaps, num_workers=len(cluster.scheduler.workers)) #Scaling the Cluster to twice its size and re-run the experiments - + new_num_workers = 2 * len(cluster.scheduler.workers) print(f"Scaling from {len(cluster.scheduler.workers)} to {new_num_workers} workers.") @@ -349,11 +349,11 @@ for the Monte-Carlo estimation of Pi. 
sleep(120) client - + #Re-run same experiments with doubled cluster - for size in (1e9 * n for n in (1, 10, 100)): - + for size in (1e9 * n for n in (1, 10, 100)): + start = time() pi = calc_pi_mc(size).compute() elaps = time() - start diff --git a/doc.zih.tu-dresden.de/docs/software/distributed_training.md b/doc.zih.tu-dresden.de/docs/software/distributed_training.md index bd45768f67c862b2a0137bd2a1656723fa6dfd91..1008e33f6a60ba3b4b189deeae2d0f2b14066ffd 100644 --- a/doc.zih.tu-dresden.de/docs/software/distributed_training.md +++ b/doc.zih.tu-dresden.de/docs/software/distributed_training.md @@ -183,7 +183,7 @@ DDP uses collective communications in the [torch.distributed](https://pytorch.org/tutorials/intermediate/dist_tuto.html) package to synchronize gradients and buffers. -The tutorial can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). +Please also look at the [official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To use distributed data parallelism on ZIH systems, please make sure the `--ntasks-per-node` parameter is equal to the number of GPUs you use per node. @@ -234,7 +234,7 @@ marie@compute$ module spider Horovod # Check available modules Horovod/0.19.5-fosscuda-2019b-TensorFlow-2.2.0-Python-3.7.4 Horovod/0.21.1-TensorFlow-2.4.1 [...] 
-marie@compute$ module load Horovod/0.19.5-fosscuda-2019b-TensorFlow-2.2.0-Python-3.7.4 +marie@compute$ module load Horovod/0.19.5-fosscuda-2019b-TensorFlow-2.2.0-Python-3.7.4 ``` Or if you want to use Horovod on the partition `alpha`, you can load it with the dependencies: diff --git a/doc.zih.tu-dresden.de/docs/software/ngc_containers.md b/doc.zih.tu-dresden.de/docs/software/ngc_containers.md index 835259ce9d6ff5bb48912911f5f02bae7d449596..f19612d9a3310f869a483c20328d51168317552a 100644 --- a/doc.zih.tu-dresden.de/docs/software/ngc_containers.md +++ b/doc.zih.tu-dresden.de/docs/software/ngc_containers.md @@ -53,7 +53,7 @@ Create a container from the image from the NGC catalog. (For this example, the alpha is used): ```console -marie@login$ srun --partition=alpha --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 bash +marie@login$ srun --partition=alpha --nodes=1 --ntasks-per-node=1 --ntasks=1 --gres=gpu:1 --time=08:00:00 --pty --mem=50000 bash marie@compute$ cd /scratch/ws/<name_of_your_workspace>/containers #please create a Workspace diff --git a/doc.zih.tu-dresden.de/docs/software/perf_tools.md b/doc.zih.tu-dresden.de/docs/software/perf_tools.md index 16007698726b0430f84ef20acc80cb9e1766d64d..83398f49cb68a3255e051ae866a3679124559bef 100644 --- a/doc.zih.tu-dresden.de/docs/software/perf_tools.md +++ b/doc.zih.tu-dresden.de/docs/software/perf_tools.md @@ -1,8 +1,8 @@ # Introduction `perf` consists of two parts: the kernel space implementation and the userland tools. This wiki -entry focusses on the latter. These tools are installed on taurus, and others and provides support -for sampling applications and reading performance counters. +entry focusses on the latter. These tools are installed on ZIH systems and others, and provide +support for sampling applications and reading performance counters. ## Configuration @@ -34,18 +34,18 @@ Run `perf stat <Your application>`. 
This will provide you with a general overview on some counters. ```Bash -Performance counter stats for 'ls':= - 2,524235 task-clock # 0,352 CPUs utilized - 15 context-switches # 0,006 M/sec - 0 CPU-migrations # 0,000 M/sec - 292 page-faults # 0,116 M/sec - 6.431.241 cycles # 2,548 GHz - 3.537.620 stalled-cycles-frontend # 55,01% frontend cycles idle - 2.634.293 stalled-cycles-backend # 40,96% backend cycles idle - 6.157.440 instructions # 0,96 insns per cycle - # 0,57 stalled cycles per insn - 1.248.527 branches # 494,616 M/sec - 34.044 branch-misses # 2,73% of all branches +Performance counter stats for 'ls':= + 2,524235 task-clock # 0,352 CPUs utilized + 15 context-switches # 0,006 M/sec + 0 CPU-migrations # 0,000 M/sec + 292 page-faults # 0,116 M/sec + 6.431.241 cycles # 2,548 GHz + 3.537.620 stalled-cycles-frontend # 55,01% frontend cycles idle + 2.634.293 stalled-cycles-backend # 40,96% backend cycles idle + 6.157.440 instructions # 0,96 insns per cycle + # 0,57 stalled cycles per insn + 1.248.527 branches # 494,616 M/sec + 34.044 branch-misses # 2,73% of all branches 0,007167707 seconds time elapsed ``` @@ -142,10 +142,10 @@ If you added a callchain, it also gives you a callchain profile.\<br /> \*Discla not an appropriate way to gain exact numbers. So this is merely a rough overview and not guaranteed to be absolutely correct.\*\<span style="font-size: 1em;"> \</span> -### On Taurus +### On ZIH systems -On Taurus, users are not allowed to see the kernel functions. If you have multiple events defined, -then the first thing you select in `perf report` is the type of event. Press right +On ZIH systems, users are not allowed to see the kernel functions. If you have multiple events +defined, then the first thing you select in `perf report` is the type of event. Press right ```Bash Available samples @@ -165,7 +165,7 @@ If you'd select cycles, you would get such a screen: ```Bash Events: 96 cycles + 49,13% test_gcc_perf test_gcc_perf [.] 
main.omp_fn.0 -+ 34,48% test_gcc_perf test_gcc_perf [.] ++ 34,48% test_gcc_perf test_gcc_perf [.] + 6,92% test_gcc_perf test_gcc_perf [.] omp_get_thread_num@plt + 5,20% test_gcc_perf libgomp.so.1.0.0 [.] omp_get_thread_num + 2,25% test_gcc_perf test_gcc_perf [.] main.omp_fn.1