Commit 5ba2c88a authored by Noah Löwer's avatar Noah Löwer

Added configuration as attachments and links

parent 71cef17f
#######################################################################
# Example configuration file for the GNU Compilers
#
# Defines: "model" => "mpi", "acc", "omp", "tgt", "tgtgpu" default "mpi"
# "label" => ext base label, default "nv"
#
# MPI-only Command:
# runhpc -c Example_gnu --reportable -T base --define model=mpi --ranks=40 small
#
# OpenACC Command:
# runhpc -c Example_gnu --reportable -T base --define model=acc --ranks=4 small
#
# OpenMP Command:
# runhpc -c Example_gnu --reportable -T base --define model=omp --ranks=1 --threads=40 small
#
# OpenMP Target Offload to Host Command:
# runhpc -c Example_gnu --reportable -T base --define model=tgt --ranks=1 --threads=40 small
#
# OpenMP Target Offload to NVIDIA GPU Command:
# runhpc -c Example_gnu --reportable -T base --define model=tgtnv --ranks=4 small
#
#######################################################################
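# Score-P instrumented variant (illustrative, not part of the original header;
# it exercises the %{tudprof} switch used further below):
# runhpc -c Example_gnu --reportable -T base --define model=mpi --define tudprof=scorep --ranks=40 small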
%ifndef %{label} # IF label is not set use gnu
% define label gnu
%endif
%ifndef %{model} # IF model is not set use mpi
% define model mpi
%endif
teeout = yes
makeflags=-j 24
# Tester Information
license_num = 37A
tester = Technische Universitaet Dresden
test_sponsor = Technische Universitaet Dresden
prepared_by = Holger Brunst
######################################################
# SUT Section
######################################################
#include: Example_SUT.inc
include: sut-taurus.inc
#[Software]
sw_compiler000 = C/C++/Fortran: Version 8.2.0 of
sw_compiler001 = GNU Compilers
sw_mpi_library = OpenMPI Version 3.1.3
sw_mpi_other = None
sw_other = None
#[General notes]
notes_000 = MPI startup command:
notes_005 = Slurm srun command was used to start MPI jobs.
#######################################################################
# End of SUT section
#######################################################################
#######################################################################
# The header section of the config file. Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
%ifndef %{tudprof}
label = %{label}_%{model}
%else
label = %{label}_%{model}_%{tudprof}
%endif
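# Resulting binary label, for illustration:
#   --define model=mpi                          -> label gnu_mpi
#   --define model=mpi --define tudprof=scorep  -> label gnu_mpi_scorep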
tune = base
output_format = text
use_submit_for_speed = 1
# Compiler Settings
default:
CC = mpicc
CXX = mpicxx
FC = mpif90
%if %{tudprof} eq 'scorep'
CC = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpicc
CXX = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpicxx
FC = scorep --mpp=mpi --instrument-filter=${SPEC}/scorep.filter mpif90
%endif
# Compiler Version Flags
CC_VERSION_OPTION = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION = --version
# enable non-official patches to this kit
#strict_rundir_verify = 0
# MPI options and binding environment, dependent upon Model being run
# Adjust to match your system
# OpenMP (CPU) Settings
%if %{model} eq 'omp'
preENV_OMP_PROC_BIND=true
preENV_OMP_PLACES=cores
%endif
#OpenMP Targeting Host Settings
%if %{model} eq 'tgt'
preENV_OMP_PROC_BIND=true
preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
%endif
#MPIRUN_OPTS = --bind-to none -q
MPIRUN_OPTS=
submit = timeout 2h srun ${MPIRUN_OPTS} -n $ranks -c $threads $command
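# With MPIRUN_OPTS empty, the OpenMP example above (--ranks=1 --threads=40)
# expands to roughly: timeout 2h srun -n 1 -c 40 <benchmark command>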
# MPI Workaround for mca issues in sph_exa
#preOMPI_MCA_topo=basic
# Score-P performance profiling
%if %{tudprof} eq 'scorep'
## only record calls to main and MPI functions (runtime filtering)
## runtime filtering was replaced by compile-time filtering (see above)
# preENV_SCOREP_FILTERING_FILE=/home/brunst/ws-hpc2020/kit91/scorep.filter
## set buffer memory size for profile/trace
preENV_SCOREP_TOTAL_MEMORY=64MB
## enable profile recording
preENV_SCOREP_ENABLE_PROFILING=true
## set to 'true' to enable detailed trace file recording
preENV_SCOREP_ENABLE_TRACING=false
## collect memory consumption per node
preENV_SCOREP_METRIC_RUSAGE=ru_maxrss
## uncomment to record cycle counter for scheduling analysis
preENV_SCOREP_METRIC_PAPI=PAPI_TOT_CYC
%endif
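## The compile-time filter file referenced above (${SPEC}/scorep.filter) is not
## included here; a minimal sketch that keeps only the program entry points
## (MPI calls are still recorded by Score-P's MPI adapter) could look like:
##   SCOREP_REGION_NAMES_BEGIN
##     EXCLUDE *
##     INCLUDE main MAIN__
##   SCOREP_REGION_NAMES_END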
#######################################################################
# Optimization
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details
# http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE = flags applicable to all compilers
# FOPTIMIZE = flags applicable to the Fortran compiler
# COPTIMIZE = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler
# Compiler flags applied to all models
default=base=default:
COPTIMIZE = -Ofast -march=native -lm # use -mcpu=native for ARM
CXXOPTIMIZE = -Ofast -march=native -std=c++14
FOPTIMIZE = -Ofast -march=native -fno-stack-protector
FPORTABILITY = -ffree-line-length-none
%if %{model} eq 'mpi'
pmodel=MPI
%endif
# OpenACC flags
%if %{model} eq 'acc'
pmodel=ACC
OPTIMIZE += -fopenacc -foffload=-lm
%endif
# OpenMP (CPU) flags
%if %{model} eq 'omp'
pmodel=OMP
OPTIMIZE += -fopenmp
%endif
# OpenMP Targeting host flags
%if %{model} eq 'tgt'
pmodel=TGT
OPTIMIZE += -fopenmp
%endif
# OpenMP Targeting Nvidia GPU flags
%if %{model} eq 'tgtnv'
pmodel=TGT
OPTIMIZE += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
%endif
# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1
#######################################################################
# Portability
#######################################################################
519.clvleaf_t,619.clvleaf_s,719.clvleaf_m,819.clvleaf_l=default=default:
# Not needed anymore?
#PORTABILITY += -DSPEC_GNU_FLUSH
# Invocation command line:
# runhpc -c nvhpc_alpha.cfg -ranks 8 --rebuild --define pmodel=acc --noreportable --tune=base --iterations=1 small
#######################################################################
# Example configuration file for the NVHPC Compilers
#
# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtgpu" default "mpi"
# "label" => ext base label, default "nv"
#
# MPI-only Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=mpi --ranks=40 small
#
# OpenACC Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=acc --ranks=4 small
#
# OpenMP Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small
#
# OpenMP Target Offload to Host Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small
#
# OpenMP Target Offload to NVIDIA GPU Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgtnv --ranks=4 small
#
#######################################################################
%ifndef %{label} # IF label is not set use nv
% define label nv
%endif
%ifndef %{pmodel} # IF pmodel is not set use mpi
% define pmodel mpi
%endif
teeout = yes
makeflags=-j 40
# Tester Information
license_num = 37A
test_sponsor = TU Dresden
tester = TU Dresden
prepared_by = Noah Trumpik (Noah.Trumpik@tu-dresden.de)
#######################################################################
# SUT Section
#######################################################################
# General SUT info
system_vendor = AMD
system_name = Alpha Centauri: AMD EPYC 7352 (AMD x86_64, NVIDIA A100-SXM4-40GB)
hw_avail = Jan-2019
sw_avail = Aug-2022
#[Node_Description: Hardware]
node_compute_syslbl = AMD Rome
node_compute_order = 1
node_compute_count = 34
node_compute_purpose = compute
node_compute_hw_vendor = AMD
node_compute_hw_model = AMD K17 (Zen2)
node_compute_hw_cpu_name = AMD EPYC 7352
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 96
node_compute_hw_ncoresperchip = 48
node_compute_hw_nthreadspercore = 2
node_compute_hw_cpu_char = Up to 2.3 GHz
node_compute_hw_cpu_mhz = 2100
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000= 16384 KB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 1 TB
node_compute_hw_disk000= 3.5 TB
node_compute_hw_disk001 = NVMe SSD Controller SM981/PM981/PM983
node_compute_hw_adapter_ib_model = Mellanox ConnectX-6
node_compute_hw_adapter_ib_interconnect = EDR InfiniBand
node_compute_hw_adapter_ib_firmware = 20.28.2006
node_compute_hw_adapter_ib_driver = mlx5_core
node_compute_hw_adapter_ib_data_rate = 200 Gb/s
node_compute_hw_adapter_ib_count = 2
node_compute_hw_adapter_ib_slot_type = PCIe
node_compute_hw_adapter_ib_ports_used = 2
node_compute_hw_other = None
#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla A100-SXM4-40GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_sw_accel_driver = NVIDIA CUDA 470.57.02
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = ASPEED Technology, Inc. (rev 04)
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = none
#[Node_Description: Software]
node_compute_sw_os000 = CentOS Linux
node_compute_sw_os001 = 7
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile000 = 4 PB Lustre parallel filesystem
node_compute_sw_sharedfile001 = over 4X EDR InfiniBand
node_compute_sw_state = Multi-user
node_compute_sw_other = None
#[Fileserver]
#[Interconnect]
interconnect_ib_syslbl = Mellanox InfiniBand
interconnect_ib_purpose = MPI Traffic and GPFS access
interconnect_ib_order = 1
interconnect_ib_hw_vendor = Mellanox
interconnect_ib_hw_topo = Non-blocking Fat-tree
#interconnect_ib_hw_switch_ib_count = 2
#interconnect_ib_hw_switch_ib_ports = 2
#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s
#interconnect_ib_hw_switch_ib_model = Mellanox Switch IB-2
#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.7 of the
sw_compiler001 = NVHPC toolkit
sw_mpi_library = Open MPI Version 4.1.1
sw_mpi_other = None
system_class = Homogenous Cluster
sw_other = CUDA Driver Version: 11.4.2
#######################################################################
# End of SUT section
#######################################################################
#######################################################################
# The header section of the config file. Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label = %{label}_%{pmodel}
tune = base
output_format = text
use_submit_for_speed = 1
# Compiler Settings
default:
CC = mpicc
CXX = mpicxx
FC = mpif90
# Compiler Version Flags
CC_VERSION_OPTION = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION = --version
# MPI options and binding environment, dependent upon Model being run
# Adjust to match your system
# OpenMP (CPU) Settings
%if %{pmodel} eq 'omp'
preENV_OMP_PLACES=cores
#preENV_OMP_PROC_BIND=true
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif
#OpenMP Targeting Host Settings
%if %{pmodel} eq 'tgt'
#preENV_OMP_PROC_BIND=true
preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
preENV_MPICH_GPU_SUPPORT_ENABLED=1
preENV_MPICH_SMP_SINGLE_COPY_MODE=CMA
preENV_MPICH_GPU_EAGER_DEVICE_MEM=0
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif
%ifdef %{ucx}
# if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI
# without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif
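# These UCX settings are only activated when the macro is defined on the
# command line, e.g. (illustrative):
# runhpc -c nvhpc_alpha.cfg --define pmodel=acc --define ucx ...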
#MPIRUN_OPTS = --bind-to none -q
#submit = mpirun ${MPIRUN_OPTS} -n $ranks $command
submit = srun $command
#######################################################################
# Optimization
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details
# http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE = flags applicable to all compilers
# FOPTIMIZE = flags applicable to the Fortran compiler
# COPTIMIZE = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler
# Compiler flags applied to all models
default=base=default:
#OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast
OPTIMIZE = -w -O3 -Mfprelaxed -Mnouniform -Mstack_arrays
COPTIMIZE = -lm # use -mcpu=native for ARM
CXXOPTIMIZE = -std=c++11
CXXPORTABILITY = --c++17
#ARM
%if %{armOn} eq 'arm'
COPTIMIZE += -mcpu=native
#OPTIMIZE += -mcpu=a64fx
%endif
# SVE
%if %{sveOn} eq 'sve'
COPTIMIZE += -march=armv8-a+sve
%endif
%if %{pmodel} eq 'mpi'
pmodel=MPI
%endif
# OpenACC flags
%if %{pmodel} eq 'acc'
pmodel=ACC
# Use with PGI compiler only
# https://docs.nvidia.com/hpc-sdk/archive/21.7/
#OPTIMIZE += -acc=gpu
OPTIMIZE += -acc -ta=tesla -tp=zen #-Minfo=accel #-DSPEC_ACCEL_AWARE_MPI->hangs it forever
# 513.soma_t:
# PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE
%endif
# OpenMP (CPU) flags
%if %{pmodel} eq 'omp'
pmodel=OMP
#OPTIMIZE += -qsmp=omp
OPTIMIZE += -fopenmp
#FOPTIMIZE +=
%endif
# OpenMP Targeting host flags
%if %{pmodel} eq 'tgt'
pmodel=TGT
# PGI
OPTIMIZE += -mp -acc=multicore
# Intel??
# OPTIMIZE += -qsmp=omp -qoffload
# -fopen-simd
# GCC (doesn't recognize its own flags)
#OPTIMIZE += -fopenmp
#OPTIMIZE += -fopenmp -mgomp
#OPTIMIZE += -fopenmp -msoft-stack -muniform-simt
#FOPTIMIZE += -homp
%endif
# OpenMP Targeting NVIDIA GPU flags
%if %{pmodel} eq 'tgtnv'
pmodel=TGT
# PGI
OPTIMIZE += -mp=gpu -acc
#FOPTIMIZE += -homp
# Note that NVHPC is in the process of adding OpenMP array
# reduction support so this option may be removed in future
513.soma_t:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE
%endif
# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1
#######################################################################
# Portability
#######################################################################
# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml
interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2
# Invocation command line:
# /lustre/scratch2/ws/0/notr584d-spec_nt/bin/runhpc --config nvhpc_ppc --define pmodel=acc --action run --nobuild --ranks=6 --reportable tiny
# output_root was not used for this run
#######################################################################
# Example configuration file for the NVHPC Compilers
#
# Defines: "pmodel" => "mpi", "acc", "omp", "tgt", "tgtnv" default "mpi"
# "label" => ext base label, default "nv"
#
# MPI-only Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=mpi --ranks=40 small
#
# OpenACC Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=acc --ranks=4 small
#
# OpenMP Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=omp --ranks=1 --threads=40 small
#
# OpenMP Target Offload to Host Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgt --ranks=1 --threads=40 small
#
# OpenMP Target Offload to NVIDIA GPU Command:
# runhpc -c Example_gnu --reportable -T base --define pmodel=tgtnv --ranks=4 small
#
#######################################################################
%ifndef %{label} # IF label is not set use default "nv"
% define label nv
%endif
%ifndef %{pmodel} # IF pmodel is not set use default mpi
% define pmodel mpi
%endif
teeout = yes
makeflags=-j 40
# Tester Information
license_num = 37A
test_sponsor = TU Dresden
tester = TU Dresden
#######################################################################
# SUT Section
#######################################################################
#include: Example_SUT.inc
# ----- Begin inclusion of 'Example_SUT.inc'
#######################################################################
# General SUT info
system_vendor = IBM
system_name = Taurus: IBM Power System AC922 (IBM Power9, Tesla V100-SXM2-32GB)
node_compute_sw_accel_driver = NVIDIA CUDA 440.64.00
node_compute_hw_adapter_ib_slot_type = None
node_compute_hw_adapter_ib_ports_used = 2
node_compute_hw_adapter_ib_model = Mellanox ConnectX-5
node_compute_hw_adapter_ib_interconnect = EDR InfiniBand
node_compute_hw_adapter_ib_firmware = 16.27.6008
node_compute_hw_adapter_ib_driver = mlx5_core
node_compute_hw_adapter_ib_data_rate = 100 Gb/s (4X EDR)
node_compute_hw_adapter_ib_count = 2
interconnect_ib_syslbl = Mellanox InfiniBand
interconnect_ib_purpose = MPI Traffic and GPFS access
interconnect_ib_order = 1
#interconnect_ib_hw_vendor = Mellanox
#interconnect_ib_hw_topo = Non-blocking Fat-tree
#interconnect_ib_hw_switch_ib_ports = 36
#interconnect_ib_hw_switch_ib_data_rate = 100 Gb/s
#interconnect_ib_hw_switch_ib_count = 1
#interconnect_ib_hw_model = Mellanox Switch IB-2
hw_avail = Nov-2018
sw_avail = Nov-2021
prepared_by = Noah Trumpik (Noah.Trumpik@tu-dresden.de)
#[Node_Description: Hardware]
node_compute_syslbl = IBM Power System AC922
node_compute_order = 1
node_compute_count = 30
node_compute_purpose = compute
node_compute_hw_vendor = IBM
node_compute_hw_model = IBM Power System AC922
node_compute_hw_cpu_name = IBM POWER9 2.2 (pvr 004e 1202)
node_compute_hw_ncpuorder = 2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 44
node_compute_hw_ncoresperchip = 22
node_compute_hw_nthreadspercore = 4
node_compute_hw_cpu_char = Up to 3.8 GHz
node_compute_hw_cpu_mhz = 2300
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 512 KB I+D on chip per core
node_compute_hw_tcache000= 10240 KB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 256 GB (16 x 16 GB RDIMM-DDR4-2666)
node_compute_hw_disk000= 2 x 1 TB (ATA Rev BE35)
node_compute_hw_disk001 = NVMe SSD Controller 172Xa/172Xb
node_compute_hw_other = None
#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla V100-SXM2-32GB
node_compute_hw_accel_count = 6
node_compute_hw_accel_vendor= NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = NVLINK
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes
#[Node_Description: Software]
node_compute_sw_os000 = Red Hat Enterprise Linux
node_compute_sw_os001 = 7.6
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = 4 PB Lustre parallel filesystem
node_compute_sw_state = Multi-user
node_compute_sw_other = None
#[Fileserver]
#[Interconnect]
#[Software]
sw_compiler000 = C/C++/Fortran: Version 21.5 of the
sw_compiler001 = NVHPC toolkit
sw_mpi_library = Open MPI Version 4.1.2
sw_mpi_other = None
system_class = Homogenous Cluster
sw_other = None
#[General notes]
notes_000 = MPI startup command:
notes_005 = srun command was used to launch jobs using 1 GPU/rank.
notes_010 =Detailed information from nvaccelinfo
notes_015 =
notes_020 =CUDA Driver Version: 11000
notes_025 =NVRM version: NVIDIA UNIX ppc64le Kernel Module 440.64.00 Wed Feb 26 16:01:28 UTC 2020
notes_030 =
notes_035 =Device Number: 0
notes_040 =Device Name: Tesla V100-SXM2-32GB
notes_045 =Device Revision Number: 7.0
notes_050 =Global Memory Size: 33822867456
notes_055 =Number of Multiprocessors: 80
notes_060 =Concurrent Copy and Execution: Yes
notes_065 =Total Constant Memory: 65536
notes_070 =Total Shared Memory per Block: 49152
notes_075 =Registers per Block: 65536
notes_080 =Warp Size: 32
notes_085 =Maximum Threads per Block: 1024
notes_090 =Maximum Block Dimensions: 1024, 1024, 64
notes_095 =Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_100 =Maximum Memory Pitch: 2147483647B
notes_105 =Texture Alignment: 512B
notes_110 =Max Clock Rate: 1530 MHz
notes_115 =Execution Timeout: No
notes_120 =Integrated Device: No
notes_125 =Can Map Host Memory: Yes
notes_130 =Compute Mode: default
notes_135 =Concurrent Kernels: Yes
notes_140 =ECC Enabled: Yes
notes_145 =Memory Clock Rate: 877 MHz
notes_150 =Memory Bus Width: 4096 bits
notes_155 =L2 Cache Size: 6291456 bytes
notes_160 =Max Threads Per SMP: 2048
notes_165 =Async Engines: 4
notes_170 =Unified Addressing: Yes
notes_175 =Managed Memory: Yes
notes_180 =Concurrent Managed Memory: Yes
notes_185 =Preemption Supported: Yes
notes_190 =Cooperative Launch: Yes
notes_195 = Multi-Device: Yes
notes_200 =Default Target: cc70
notes_205 =
#######################################################################
# End of SUT section
#######################################################################
#######################################################################
# The header section of the config file. Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label = %{label}_%{pmodel}
tune = base
output_format = text
use_submit_for_speed = 1
# Compiler Settings
default:
CC = mpicc
CXX = mpic++
FC = mpif90
# Compiler Version Flags
CC_VERSION_OPTION = --version
CXX_VERSION_OPTION = --version
FC_VERSION_OPTION = --version
# MPI options and binding environment, dependent upon Model being run
# Adjust to match your system
# OpenMP (CPU) Settings
%if %{pmodel} eq 'omp'
preENV_OMP_PLACES=cores
#preENV_OMP_PROC_BIND=true
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif
#OpenMP Targeting Host Settings
%if %{pmodel} eq 'tgt'
#preENV_OMP_PROC_BIND=true
preENV_MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
preENV_MPICH_GPU_SUPPORT_ENABLED=1
preENV_MPICH_SMP_SINGLE_COPY_MODE=CMA
preENV_MPICH_GPU_EAGER_DEVICE_MEM=0
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
#preENV_OMP_PLACES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
%endif
%ifdef %{ucx}
# if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI
# without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif
#MPIRUN_OPTS = --bind-to none -q
# 1 GPU per RS, 7 cores per RS, 1 MPI task per RS, 6 RS per host
submit = srun ${MPIRUN_OPTS} $command
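# A hypothetical explicit Slurm mapping matching the comment above (assumption,
# adjust to the partition's actual CPU/GPU gres layout):
# MPIRUN_OPTS = --ntasks-per-node=6 --cpus-per-task=7 --gpus-per-task=1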
#######################################################################
# Optimization
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details
# http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE = flags applicable to all compilers
# FOPTIMIZE = flags applicable to the Fortran compiler
# COPTIMIZE = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler
# Compiler flags applied to all models
default=base=default:
OPTIMIZE = -O3
COPTIMIZE = -lm # use -mcpu=native for ARM
CXXOPTIMIZE = -std=c++11
#FOPTIMIZE = -ffree-line-length-none -fno-stack-protector
FOPTIMIZE =
%if %{pmodel} eq 'mpi'
pmodel=MPI
%endif
# OpenACC flags
%if %{pmodel} eq 'acc'
# Use with PGI compiler only
# https://docs.nvidia.com/hpc-sdk/archive/21.5/
pmodel=ACC
#OPTIMIZE += -acc=gpu
#OPTIMIZE += -acc -ta=tesla
OPTIMIZE += -acc -ta=tesla -DSPEC_ACCEL_AWARE_MPI #-Minfo=accel
%endif
# Note that NVHPC is in the process of adding OpenMP array
# reduction support so this option may be removed in future
# reduction not supported on taurusml due to old driver
513.soma_t:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE
613.soma_s:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE
# OpenMP (CPU) flags
%if %{pmodel} eq 'omp'
pmodel=OMP
#OPTIMIZE += -qsmp=omp
OPTIMIZE += -fopenmp
#FOPTIMIZE +=
%endif
# OpenMP Targeting host flags
%if %{pmodel} eq 'tgt'
pmodel=TGT
# PGI
OPTIMIZE += -mp -acc=multicore
# Intel??
# OPTIMIZE += -qsmp=omp -qoffload
# -fopen-simd
# GCC (doesn't recognize its own flags)
#OPTIMIZE += -fopenmp
#OPTIMIZE += -fopenmp -mgomp
#OPTIMIZE += -fopenmp -msoft-stack -muniform-simt
#FOPTIMIZE += -homp
%endif
# OpenMP Targeting NVIDIA GPU flags
%if %{pmodel} eq 'tgtnv'
pmodel=TGT
# PGI
OPTIMIZE += -mp=gpu -acc
#FOPTIMIZE += -homp
%endif
# No peak flags set, so make peak use the same flags as base
default=peak=default:
basepeak=1
#######################################################################
# Portability
#######################################################################
# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags.xml
interconnect_ib_hw_switch_ib_model000 = Mellanox IB EDR Switch IB-2
@@ -53,7 +53,13 @@ The behavior in terms of how to build, run and report the benchmark in a particular environment is
 controlled by a configuration file. There are a few examples included in the source code.
 Here you can apply compiler tuning and porting, specify the runtime environment and describe the
 system under test. SPEChpc2021 has been deployed on the partitions `haswell`, `ml` and
-`alpha`, configurations are available. No matter which one you choose as a starting point,
+`alpha`. Configurations are available, respectively:
+- [gnu-taurus.cfg](misc/spec_gnu-taurus.cfg)
+- [nvhpc-ppc.cfg](misc/spec_nvhpc-ppc.cfg)
+- [nvhpc-alpha.cfg](misc/spec_nvhpc-alpha.cfg)
+No matter which one you choose as a starting point,
 double-check the line that defines the submit command and make sure it says `srun [...]`, e.g.
 ``` bash
......