Commit ffa4bd62 authored by Robert Dietrich's avatar Robert Dietrich

added documentation for the prolog and epilog scripts

parent b4c18e44
# SLURM Prolog and Epilog Scripts
Currently, PIKA uses a global prolog and epilog script for each system or partition. The subfolders contain platform-dependent prolog and epilog scripts. The prolog and epilog scripts are split into several parts. The main scripts, prolog.sh and epilog.sh, are structured as follows.
## prolog.sh
1. Get PIKA environment variables <br>
--> [pika-current.conf](../../../pika-current.conf) <br>
&emsp;&emsp;--> [pika.conf](../../../pika.conf)
2. Check for active jobs and make this job visible to other prologs
3. Setup debugging
4. Install the PIKA package <br>
--> [pika_install_prolog_include.sh](pika_install_prolog_include.sh)
5. Check/setup logrotate & wait until PIKA python is available
6. Determine master node
7. Set default values for the SLURM environment (in case Redis database is down) <br>
--> [pika_slurm_env.sh](pika_slurm_env.sh)
8. Get job metadata from Redis database <br>
--> [pika_get_metadata_prolog_include.sh](pika_get_metadata_prolog_include.sh) <br>
&emsp;&emsp;--> [pika_slurm_env_redis_new.py](pika_slurm_env_redis_new.py)
9. Start/stop collectd <br>
--> [pika_collectd_prolog_include.sh](pika_collectd_prolog_include.sh) <br>
&emsp;&emsp;--> [pika_start_collectd.sh](pika_start_collectd.sh)
10. Send job metadata to MariaDB <br>
--> [pika_save_metadata_prolog_include.sh](pika_save_metadata_prolog_include.sh) (uses [pika_utils.sh](pika_utils.sh))
## epilog.sh
1. Get PIKA environment variables <br>
--> [pika-current.conf](../../../pika-current.conf) <br>
&emsp;&emsp;--> [pika.conf](../../../pika.conf)
2. Check if prolog was called and debug file is available
3. Read SLURM environment from file (created during the prolog)
4. Determine master node
5. Update job metadata <br>
--> [pika_update_metadata_epilog_include.sh](pika_update_metadata_epilog_include.sh) (uses [pika_utils.sh](pika_utils.sh))
6. Set LIKWID counters if LIKWID is used with direct access mode
7. Cleanup local data
......@@ -4,23 +4,24 @@
# exit 0
#fi
# Skip this epilog entirely on taurusi7* nodes
# (NOTE(review): presumably PIKA monitoring is disabled on this partition — confirm).
if [[ $HOSTNAME = taurusi7* ]]; then
exit 0
fi
# Alternative (commented out): skip taurusi7* only for non-developer jobs.
#if [[ $HOSTNAME = taurusi7* ]] && [ "$SLURM_JOB_USER" != "rdietric" ]; then
# exit 0
#fi
# (1) Get PIKA environment variables (defines LOCAL_STORE, PIKA_ROOT,
# PYTHON_ROOT, PIKA_DEBUG, ... — NOTE(review): confirm against pika-current.conf).
source /sw/taurus/tools/pika/pika-current.conf
# Re-export the SLURM job id and nodelist under PIKA-prefixed names, as used
# by the sourced include scripts below.
export PIKA_JOB_ID=${SLURM_JOB_ID}
export PIKA_JOB_NODELIST=${SLURM_NODELIST}
# (2.1) Check if the prolog ran for this job: the prolog touches a per-job
# marker file, which is removed here. If the marker is missing, the prolog
# never ran, so there is nothing to clean up — exit silently.
if [ -e ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID} ]; then
rm -f ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID}
else
exit 0
fi
# check for debug file
# (2.2) Check for debug file
if [ "${PIKA_DEBUG}" == "1" ]; then
mkdir -p /tmp/pika_debug
DEBUG_PATH=/tmp/pika_debug/pika_${PIKA_JOB_ID}
......@@ -29,10 +30,10 @@ else
DEBUG_PATH=/dev/null
fi
# (3) Path of the file holding this job's SLURM environment, written during
# the prolog (NOTE(review): presumably sourced by
# pika_update_metadata_epilog_include.sh — confirm). Removed in step (7).
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# determine master node
# (4) Determine master node
if [ -x "$(command -v ${PYTHON_ROOT}/bin/nodeset)" ]; then
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | ${PYTHON_ROOT}/bin/nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
else
......@@ -58,15 +59,18 @@ echo -e "\nMASTER_NODE=$MASTER_NODE" >> $DEBUG_PATH 2>&1
# Short hostname (domain suffix stripped) of the node running this epilog.
PIKA_HOSTNAME=$(hostname | cut -d. -f1)
echo "PIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
# (5) Update job metadata via the epilog include script; all output goes to
# the debug file (or /dev/null when debugging is off).
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_update_metadata_epilog_include.sh >> $DEBUG_PATH 2>&1
# Previous unconditional variant, kept for reference:
#if [ $PIKA_MONITORING -eq 1 ] && [ $PIKA_JOB_EXCLUSIVE -eq 1 ]; then
# echo "PUTNOTIF severity=okay time=$(date +%s) plugin=likwid message=rstCtrs" | nc -U ${PIKA_COLLECTD_SOCKET}
#fi
# (6) Reset LIKWID counters if LIKWID is used with direct (MSR) access mode.
if [ "${PIKA_LIKWID_MODE}" = "direct" ]; then
# Only for exclusive jobs with monitoring enabled: send a rstCtrs
# notification to collectd's likwid plugin via its UNIX socket.
if [ $PIKA_MONITORING -eq 1 ] && [ $PIKA_JOB_EXCLUSIVE -eq 1 ]; then
echo "PUTNOTIF severity=okay time=$(date +%s) plugin=likwid message=rstCtrs" | nc -U ${PIKA_COLLECTD_SOCKET}
fi
fi
# (7) Cleanup local data: remove the per-job SLURM environment file that the
# prolog created.
rm -f ${BATCHSYSTEM_ENV_FILE}
# Fixed typo in the final log message ("sucessfully" -> "successfully").
echo -e "\nEpilog finished successfully!" >> $DEBUG_PATH 2>&1
......
......@@ -4,13 +4,15 @@
# exit 0
#fi
# Skip this prolog entirely on taurusi7* nodes
# (NOTE(review): presumably PIKA monitoring is disabled on this partition — confirm).
if [[ $HOSTNAME = taurusi7* ]]; then
exit 0
fi
# Alternative (commented out): skip taurusi7* only for non-developer jobs.
#if [[ $HOSTNAME = taurusi7* ]] && [ "$SLURM_JOB_USER" != "rdietric" ]; then
# exit 0
#fi
####################################
# (1) Get PIKA environment variables (defines LOCAL_STORE, PIKA_ROOT,
# PYTHON_ROOT, PIKA_DEBUG, ... — NOTE(review): confirm against pika-current.conf).
source /sw/taurus/tools/pika/pika-current.conf
##### (1) if SLURM_JOB_ID is of length zero
# if SLURM_JOB_ID is of length zero
if [[ -z "${SLURM_JOB_ID}" ]]; then
mkdir -p /tmp/pika_debug
echo -e "\n SLURM_JOB_ID is not available. Exit prolog" > /tmp/pika_debug/slurm_job_id_not_available 2>&1
......@@ -20,15 +22,15 @@ else
export PIKA_JOB_ID=${SLURM_JOB_ID}
fi
####################################
# (2) Check for active jobs and make this job visible to other prologs
# Count PIKA jobs already running on this node via their per-job marker
# files in LOCAL_STORE.
LOCAL_JOBS_RUNNING=`ls -l ${LOCAL_STORE} | grep -c pika_prolog_`
# Create this job's marker file (removed again by the epilog).
touch ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID}
##### (3) Generate debug file
####################################
# (3) Setup debugging
if [ ${PIKA_DEBUG} -eq 1 ]; then
# delete debug files older than 7 days
find /tmp/pika_debug/pika_* -mtime +7 -exec rm {} \;
......@@ -42,14 +44,14 @@ else
DEBUG_PATH=/dev/null
fi
# Developer-only debugging: for jobs of the PIKA developer, run the collectd
# topo helper and append its output to the debug file.
if [ "$SLURM_JOB_USER" = "rdietric" ]; then
echo -e "\n### $SLURM_JOB_USER ###" >> $DEBUG_PATH 2>&1
# topo binary needs the LIKWID shared library on the loader path.
export LD_LIBRARY_PATH=${PIKA_BUILD_PATH}/likwid/${LIKWID_VERSION}/lib:$LD_LIBRARY_PATH
/sw/taurus/tools/pika/daemon/collectd/collectd-plugins/c/topo >> $DEBUG_PATH 2>&1
echo "### End $SLURM_JOB_USER ###" >> $DEBUG_PATH 2>&1
fi
# END: developer debugging
# Timestamp the debug output.
date >> $DEBUG_PATH 2>&1
......@@ -60,7 +62,7 @@ if ! [ -d "${LOCAL_STORE}" ]; then
echo -e "\nError:LOCAL_STORE=${LOCAL_STORE} does not exist." >> $DEBUG_PATH 2>&1
fi
##### (4) if SLURM_NODELIST is of length zero
# if SLURM_NODELIST is of length zero, exit prolog
if [[ -z "${SLURM_NODELIST}" ]]; then
echo -e "\n SLURM_NODELIST is not available. Exit prolog" >> $DEBUG_PATH 2>&1
exit 0
......@@ -68,22 +70,22 @@ else
export PIKA_JOB_NODELIST=${SLURM_NODELIST}
fi
####################################
# (4) Install the PIKA package
# The lock file serializes both the package install and the collectd start
# across concurrently running prologs on this node.
lock_collectd=${LOCAL_STORE}/pika_collectd_setup.lock
have_setup_lock=false
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_install_prolog_include.sh >> $DEBUG_PATH 2>&1
####################################
# (5.1) Check/setup logrotate
# Install pika_logrotate.sh into /etc/cron.daily if it is not there yet.
if [ ! -f "/etc/cron.daily/pika_logrotate.sh" ]; then
echo -e "\nSetup logrotate" >> $DEBUG_PATH 2>&1
cp ${PIKA_ROOT}/daemon/logrotate/pika_logrotate.sh /etc/cron.daily >> $DEBUG_PATH 2>&1
fi
# (5.2) Wait until PIKA python is available
echo -e "\nCheck PIKA python3:" >> $DEBUG_PATH 2>&1
# Path of PIKA's bundled python3 interpreter, probed by the check below.
pika_python_bin=${PYTHON_ROOT}/bin/python3
if [ -x "$(command -v ${pika_python_bin})" ]; then
......@@ -99,7 +101,8 @@ else
done
fi
# determine master node
####################################
# (6) Determine master node
if [ -x "$(command -v ${PYTHON_ROOT}/bin/nodeset)" ]; then
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | ${PYTHON_ROOT}/bin/nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
else
......@@ -125,10 +128,12 @@ echo "PIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
# File where this job's SLURM environment is stored (can be sourced later,
# e.g. by the epilog).
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
####################################
# (7) Set default values for all PIKA metadata provided by SLURM, so the
# job is still recorded if the Redis database is down.
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
####################################
# (8) Get job metadata from Redis database
# Monitoring defaults to enabled (NOTE(review): presumably the Redis query
# below may switch it off per job — confirm).
PIKA_MONITORING=1
if [ -x "$(command -v ${pika_python_bin})" ]; then
which python3 >> $DEBUG_PATH 2>&1
......@@ -137,15 +142,11 @@ else
echo -e "Error: PIKA python3 is NOT available!" >> $DEBUG_PATH 2>&1
fi
####################################
# (9) Start/stop collectd, based on the PIKA_MONITORING value.
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_collectd_prolog_include.sh >> $DEBUG_PATH 2>&1
# Leftover manual LIKWID debugging commands, kept for reference:
#/opt/pika/1.0/likwid/5.0.1/bin/likwid-topology >> $DEBUG_PATH 2>&1
#export OMP_NUM_THREADS=176
#export HOME="workaroundLIKWIDbug"
#/opt/pika/1.0/likwid/5.0.1/bin/likwid-perfctr -V 3 -g pika_metrics_1 ~rdietric/examples/bash_scripts/laplace2d-taurusml >> $DEBUG_PATH 2>&1
# (10) Send job metadata to MariaDB
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_save_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
echo -e "\nProlog finished successfully!" >> $DEBUG_PATH 2>&1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment