Commit 2983cecd authored by Robert Dietrich

added patch for counter resetting with Likwid 4.3.3

parent cdfb4a60
@@ -28,4 +28,5 @@ mem_bw (MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+
LONG
--
-
mem_bw (MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
diff -ruN src.save/includes/likwid.h src/includes/likwid.h
--- src.save/includes/likwid.h 2019-03-27 15:48:05.164171000 +0100
+++ src/includes/likwid.h 2019-03-27 15:50:17.917221000 +0100
@@ -740,6 +740,14 @@
@return Returns the ID of the new eventSet
*/
extern int perfmon_addEventSet(const char* eventCString) __attribute__ ((visibility ("default") ));
+/*! \brief Reset the cached configuration of all performance monitoring counters of an eventSet
+
+Clearing the cached register configuration forces the next call to perfmon_setupCounters() to rewrite all control registers.
+@param [in] groupId ID of the event set, as returned by perfmon_addEventSet()
+@return 0 on success, -ENOLCK if access to the performance monitoring registers is locked, -EINVAL if the perfmon module is not initialized, -ENOENT if groupId is invalid
+*/
+extern int perfmon_setCountersConfig(int groupId) __attribute__ ((visibility ("default") ));
/*! \brief Setup all performance monitoring counters of an eventSet
A event string looks like Eventname:Countername(:Option1:Option2:...),...
diff -ruN src.save/perfmon.c src/perfmon.c
--- src.save/perfmon.c 2019-03-27 15:48:05.848159000 +0100
+++ src/perfmon.c 2019-03-27 15:52:48.778104000 +0100
@@ -1963,6 +1963,39 @@
}
int
+perfmon_setCountersConfig(int groupId)
+{
+ int i;
+ int ret = 0;
+ if (!lock_check())
+ {
+ ERROR_PLAIN_PRINT(Access to performance monitoring registers locked);
+ return -ENOLCK;
+ }
+ if (perfmon_initialized != 1)
+ {
+ ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+ return -EINVAL;
+ }
+ if (unlikely(groupSet == NULL))
+ {
+ return -EINVAL;
+ }
+
+ if (groupId >= groupSet->numberOfActiveGroups)
+ {
+ ERROR_PRINT(Group %d does not exist in groupSet, groupId);
+ return -ENOENT;
+ }
+
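+ /* Clear the cached control register values for every CPU of the group so
+ that the following perfmon_setupCounters() call reprograms all counters. */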
+ for(i=0;i<groupSet->numberOfThreads;i++)
+ {
+ memset(currentConfig[groupSet->threads[i].processorId], 0, NUM_PMC * sizeof(uint64_t));
+ }
+ return 0;
+}
+
+int
perfmon_setupCounters(int groupId)
{
int i;
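For reference, a minimal usage sketch of the new call (not part of this commit; it assumes the standard Likwid 4.3 C API, omits error handling, and uses a placeholder CPU list and event string). perfmon_setCountersConfig() clears the cached register configuration, so calling it before perfmon_setupCounters() forces all control registers to be reprogrammed, which resets the counters between measurements:

#include <stdio.h>
#include <likwid.h>

int main(void)
{
    int cpus[1] = {0};                      /* measure on CPU 0 only */
    topology_init();
    perfmon_init(1, cpus);

    int gid = perfmon_addEventSet("INSTR_RETIRED_ANY:FIXC0");   /* example event */

    for (int run = 0; run < 3; run++)
    {
        perfmon_setCountersConfig(gid);     /* drop cached config -> full reprogramming */
        perfmon_setupCounters(gid);
        perfmon_startCounters();
        /* ... region of interest ... */
        perfmon_stopCounters();
        printf("run %d: %f\n", run, perfmon_getResult(gid, 0, 0));
    }

    perfmon_finalize();
    topology_finalize();
    return 0;
}

Build against the patched library with -llikwid. The extra call is only needed when the registers must be rewritten even though Likwid's cached configuration has not changed.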
#!/bin/bash
# build for a specific version of PIKA
export PIKA_ROOT=/sw/taurus/tools/pika
source ${PIKA_ROOT}/pika-1.1.conf
export PIKA_PATCHES=${PIKA_ROOT}/install/compute_node/patches
if [[ $(hostname -s) = taurusml* ]]; then
export PIKA_TARGET=power
export CUDA_PATH=/usr/local/cuda-9.2
else
export CUDA_PATH=/sw/installed/CUDA/10.1.243
fi
pika_install-0.9.conf
\ No newline at end of file
@@ -61,10 +61,15 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
PIKA_JOB_NAME="${PIKA_JOB_NAME:0:252}..."
fi
# convert walltime from minutes to seconds!!!
# if [[ ! -z "${PIKA_JOB_WALLTIME}" ]]; then
# PIKA_JOB_WALLTIME=$((PIKA_JOB_WALLTIME * 60))
# fi
# convert walltime from minutes to seconds
if [[ ! -z "${PIKA_JOB_WALLTIME}" ]]; then
PIKA_JOB_WALLTIME=$((PIKA_JOB_WALLTIME * 60))
fi
# for exclusive jobs on more than one node, save all allocated nodes in the CPU list so that jobs can be searched by node
if [ $PIKA_JOB_EXCLUSIVE -eq 1 ] && [ $JOB_NUM_NODES -gt 1 ]; then
PIKA_JOB_CPUS_ALLOCATED=`echo ${PIKA_JOB_NODELIST} | nodeset -e`
fi
# create sql statement
SQL_QUERY="INSERT INTO Job_Data "
......
@@ -6,7 +6,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
source ${PIKA_ROOT}/pika_utils.sh >> $DEBUG_PATH 2>&1
JOB_STATUS="completed"
ERROR_CODE=0
PROPERTY_ID=0
# get start time from prolog
LOCAL_TIME_STORE=${LOCAL_STORE}/pika_local_time_${PIKA_JOB_ID}
@@ -49,7 +49,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
echo -e "\nJOB_DURATION=${JOB_DURATION}" >> $DEBUG_PATH 2>&1
# check if job ran into timeout
if [ "${JOB_DURATION}" -gt "$((PIKA_JOB_WALLTIME * 60))" ]; then
if [ "${JOB_DURATION}" -gt "${PIKA_JOB_WALLTIME}" ]; then
if [ "${PIKA_JOB_WALLTIME}" == "0" ]; then
JOB_STATUS="completed"
else
@@ -57,30 +57,30 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
fi
fi
# determine Error code TODO: rename to Properties
# determine property ID
if [[ $PIKA_MONITORING -eq -1 ]]; then
# slurm data incomplete
ERROR_CODE=$((ERROR_CODE+1))
PROPERTY_ID=$((PROPERTY_ID+1))
# check if job is exclusive, if so, collectd monitoring is disabled as well
LOCAL_JOBS_RUNNING=`ls -l ${LOCAL_STORE} | grep -c pika_prolog_`
echo -e "\nLOCAL_JOBS_RUNNING=${LOCAL_JOBS_RUNNING}" >> $DEBUG_PATH 2>&1
if [[ ${LOCAL_JOBS_RUNNING} -eq 0 ]]; then
ERROR_CODE=$((ERROR_CODE+4))
PROPERTY_ID=$((PROPERTY_ID+4))
fi
fi
if [[ $PIKA_MONITORING -eq 0 ]]; then
# collectd monitoring is disabled
ERROR_CODE=$((ERROR_CODE+4))
PROPERTY_ID=$((PROPERTY_ID+4))
fi
echo -e "\nERROR_CODE=${ERROR_CODE}" >> $DEBUG_PATH 2>&1
echo -e "\nPROPERTY_ID=${PROPERTY_ID}" >> $DEBUG_PATH 2>&1
# update job data in mariadb
if [ "${JOB_DURATION}" -lt "60" ]; then
SQL_QUERY="DELETE FROM Job_Data WHERE JID=${PIKA_JOB_ID}"
else
SQL_QUERY="UPDATE Job_Data SET STATUS='${JOB_STATUS}',END='${JOB_END}',ERROR_CODE='${ERROR_CODE}' "
SQL_QUERY="UPDATE Job_Data SET STATUS='${JOB_STATUS}',END='${JOB_END}',PROPERTY_ID='${PROPERTY_ID}' "
SQL_QUERY+="WHERE JID=${PIKA_JOB_ID} AND START=${JOB_START}"
fi
......
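Aside on the PROPERTY_ID written above: it is effectively a bit mask, with 1 added when the Slurm metadata is incomplete and 4 added when collectd monitoring is disabled. A small sketch of how a consumer of the Job_Data table could decode it (the flag names are illustrative only, not part of PIKA):

#include <stdio.h>

/* Illustrative names only; the values match the increments used in the epilog. */
enum {
    PROP_SLURM_DATA_INCOMPLETE = 1,
    PROP_COLLECTD_DISABLED     = 4
};

static void print_job_properties(int property_id)
{
    if (property_id & PROP_SLURM_DATA_INCOMPLETE)
        printf("Slurm metadata incomplete\n");
    if (property_id & PROP_COLLECTD_DISABLED)
        printf("collectd monitoring disabled\n");
    if (property_id == 0)
        printf("no special properties\n");
}

int main(void)
{
    print_job_properties(5);   /* both flags set: 1 + 4 */
    return 0;
}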
#!/bin/bash
source /sw/taurus/tools/pika/pika-current.conf
export PIKA_JOB_ID=${SLURM_JOB_ID}
export PIKA_JOB_NODELIST=${SLURM_NODELIST}
# check if prolog script was called
if [ -e ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID} ]; then
rm -f ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID}
else
exit 0
fi
# check for debug file
if [ "${PIKA_DEBUG}" == "1" ]; then
mkdir -p /tmp/pika_debug
DEBUG_PATH=/tmp/pika_debug/pika_${PIKA_JOB_ID}
echo -e "\nStart epilog debugging..." >> $DEBUG_PATH 2>&1
else
DEBUG_PATH=/dev/null
fi
# file that contains job metadata
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1`
# this node's name
PIKA_HOSTNAME=$(hostname)
# update job metadata
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_update_metadata_epilog_include.sh >> $DEBUG_PATH 2>&1
# cleanup local data
rm -f ${BATCHSYSTEM_ENV_FILE}
echo -e "\nEpilog finished sucessfully!" >> $DEBUG_PATH 2>&1
exit 0
../epilog.sh
\ No newline at end of file
#!/bin/bash
source /sw/taurus/tools/pika/pika-current.conf
##### (1) if SLURM_JOB_ID is of length zero
if [[ -z "${SLURM_JOB_ID}" ]]; then
mkdir -p /tmp/pika_debug
echo -e "\n SLURM_JOB_ID is not available. Exit prolog" > /tmp/pika_debug/slurm_job_id_not_available 2>&1
env | grep SLURM >> /tmp/pika_debug/slurm_job_id_not_available 2>&1
exit 0
else
export PIKA_JOB_ID=${SLURM_JOB_ID}
fi
##### (2) check for active jobs and make job visible to other prologs #####
# number of local running jobs
LOCAL_JOBS_RUNNING=`ls -l ${LOCAL_STORE} | grep -c pika_prolog_`
# create prolog file (which is removed in epilog)
touch ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID}
##### (3) Generate debug file
if [ ${PIKA_DEBUG} -eq 1 ]; then
mkdir -p /tmp/pika_debug
# delete debug files older than 7 days
find /tmp/pika_debug/pika_* -mtime +7 -exec rm {} \;
find /tmp/pika_debug/memcache_* -mtime +7 -exec rm {} \;
DEBUG_PATH=/tmp/pika_debug/pika_${PIKA_JOB_ID}
echo -e "Start prolog debugging..." > $DEBUG_PATH 2>&1
chmod o+r $DEBUG_PATH
else
DEBUG_PATH=/dev/null
fi
##### (4) if SLURM_NODELIST is of length zero
if [[ -z "${SLURM_NODELIST}" ]]; then
echo -e "\n SLURM_NODELIST is not available. Exit prolog" >> $DEBUG_PATH 2>&1
exit 0
else
export PIKA_JOB_NODELIST=${SLURM_NODELIST}
fi
##### (5) pika package installation
# install pika python and likwid in /opt/pika if it is not already there
if [ ! -d "${PIKA_INSTALL_PATH}" ]; then
echo -e "\nUnpack PIKA software stack to ${PIKA_INSTALL_PATH}" >> $DEBUG_PATH 2>&1
# delete old installation if it is still there
if [ -d "/opt/pika" ]; then
rm -rf /opt/pika
fi
mkdir -p ${PIKA_INSTALL_PATH}
echo -e "tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.." >> $DEBUG_PATH 2>&1
tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.. >> $DEBUG_PATH 2>&1
fi
##### (6) pika presetup
# setup logrotate
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_logrotate_prolog_include.sh >> $DEBUG_PATH 2>&1
# check python
echo -e "\nCheck python3 path:" >> $DEBUG_PATH 2>&1
which python3 >> $DEBUG_PATH 2>&1
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1`
# this node's name
PIKA_HOSTNAME=$(hostname)
echo -e "\nMASTER_NODE=$MASTER_NODE" >> $DEBUG_PATH 2>&1
echo -e "\nPIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
# file where job information is stored (can be sourced later)
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# set defaults for all pika metadata provided by SLURM
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
##### (7) get additional job metadata from redis
PIKA_MONITORING=1
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_get_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
##### (8) based on the PIKA_MONITORING value, start or stop collectd
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_collectd_prolog_include.sh >> $DEBUG_PATH 2>&1
##### (9) save job metadata
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_save_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
echo -e "\nProlog finished sucessfully!" >> $DEBUG_PATH 2>&1
exit 0
\ No newline at end of file
../prolog.sh
\ No newline at end of file