Commit 20cbee31 authored by Robert Dietrich

a couple of fixes in prolog and epilog

parent 784ac173
......@@ -25,10 +25,10 @@ fi
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1`
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
# this node's name
PIKA_HOSTNAME=$(hostname)
PIKA_HOSTNAME=$(hostname | cut -d. -f1)
# update job metadata
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_update_metadata_epilog_include.sh >> $DEBUG_PATH 2>&1
......
......@@ -14,7 +14,7 @@ if [ ${LOCAL_JOBS_RUNNING} -eq 0 ]; then
fi
# we stop collectd if:
# user wants to disable monitoring or
# user wants to disable monitoring OR
# we do not get the no_monitoring flag, but there is no other job running on the local node
if [[ $PIKA_MONITORING -eq 0 || ($PIKA_MONITORING -eq -1 && ${LOCAL_JOBS_RUNNING} -eq 0) ]]; then
echo -e "\nExclusive and no_monitoring set! Stop collectd." >> $DEBUG_PATH 2>&1
......@@ -24,27 +24,30 @@ if [[ $PIKA_MONITORING -eq 0 || ($PIKA_MONITORING -eq -1 && ${LOCAL_JOBS_RUNNING
# check if COLLECTD_PID is empty
if [ -z "$COLLECTD_PID" ]; then
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
else
echo -e "\nkill -TERM $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
echo -e "\nkill -TERM $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
fi
fi
# Clean up surplus PIKA collectd instances: count the matching processes and,
# if more than one is found, send SIGTERM to all but the last PID in the list.
#
# determine number of PIKA collectd processes
# (counts `ps` lines whose command matches $DAEMON; the grep process itself is filtered out)
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nActive PIKA collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
# if we have more than one collectd process running, kill all but the last
if [ $active_procs -gt 1 ]; then
# PIDs of all processes whose command line contains the literal "pika_collectd".
# NOTE(review): the count above filters on "$DAEMON", this list on "pika_collectd";
# the two agree only if $DAEMON contains that string -- confirm.
COLLECTD_PIDS=`ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'`
# The unquoted echo joins the PID list into one space-separated line; sed then
# removes the trailing word, leaving every PID except the last one.
# NOTE(review): this reads $COLLECTD_PID (set outside this block), not the
# $COLLECTD_PIDS collected just above, which is only used in the log message
# below -- possibly a typo; confirm which variable is intended.
KILL_COLLECTD_PIDS=`echo $COLLECTD_PID | sed s/'\w*$'//` # strip the last word (PID) from the line
echo -e "\nMore than one PIKA collectd active: ${COLLECTD_PIDS}. kill -TERM ${KILL_COLLECTD_PIDS}" >> $DEBUG_PATH 2>&1
# intentionally unquoted so that each PID is passed to kill as a separate argument
kill -TERM ${KILL_COLLECTD_PIDS} >> $DEBUG_PATH 2>&1
fi
# we only check for collectd if $PIKA_MONITORING=1
if [[ $PIKA_MONITORING -eq 1 ]]; then
# determine number of PIKA collectd processes
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nActive collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
# if collectd is not running yet
if [ $active_procs -eq 0 ]; then
echo -e "\nReset lustre counter" >> $DEBUG_PATH 2>&1
for fs in /proc/fs/lustre/llite/*; do
llstat -c $fs/stats
done
CD_INST_PATH=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION
CUSTOM_TYPES_DIR=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION/share/collectd
......@@ -64,6 +67,18 @@ if [[ $PIKA_MONITORING -eq 1 ]]; then
# set python module path
sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" ${COLLECTD_CONF}
# Check whether Lustre is available on this node. If it is, reset the Lustre
# counters; otherwise disable the Lustre plugin in the collectd configuration.
LUSTRE_PATH=/proc/fs/lustre
if [ -d "${LUSTRE_PATH}" ]; then
echo -e "\nLustre is available. Reset Lustre counters." >> $DEBUG_PATH 2>&1
# Run llstat -c on the stats file of every entry below llite/.
# NOTE(review): if llite/ has no entries the glob stays literal (unless nullglob
# is set) and llstat is called once on a non-existent path -- confirm this is harmless.
# llstat output is not redirected to $DEBUG_PATH here.
for fs in ${LUSTRE_PATH}/llite/*; do
llstat -c $fs/stats
done
else
echo -e "Disable Lustre plugin. ${LUSTRE_PATH} not found." >> $DEBUG_PATH 2>&1
# Comment out (prefix with '#') every line from the one containing
# Import "lustre_bw" through the next line containing </Module>, editing
# ${COLLECTD_CONF} in place.
sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${COLLECTD_CONF}"
fi
# set InfluxDB access settings
sed -i "/#INFLUXHOST/c \ \ \ \ host \"${INFLUXDB_HOST}\"" ${COLLECTD_CONF}
......@@ -114,7 +129,7 @@ if [[ $PIKA_MONITORING -eq 1 ]]; then
#expose path to likwid library (needed to load collectd likwid plugin)
export LD_LIBRARY_PATH=${LIKWID_INST_PATH}/lib:${LD_LIBRARY_PATH}
# set the socket file from PIKO configuration
# set the socket file from PIKA configuration
sed -i '/<Plugin unixsock>/,/Plugin>/'" s|SocketFile.*|SocketFile \"${PIKA_COLLECTD_SOCKET}\"|" ${COLLECTD_CONF}
else
echo -e "Error: No PIKA group definitions for LIKWID found! Disable LIKWID and unixsock plugins." >> $DEBUG_PATH 2>&1
......@@ -131,18 +146,21 @@ if [[ $PIKA_MONITORING -eq 1 ]]; then
export HOME="workaroundLIKWIDbug"
#echo -e `ldd /opt/pika/2.5/collectd/5.10.0/lib/collectd/likwid.so` >> $DEBUG_PATH 2>&1
#echo -e "/sw/taurus/tools/pika/collectd/collectd-plugins/c/test_likwid -v3 -g$group_string" >> $DEBUG_PATH 2>&1
#/sw/taurus/tools/pika/collectd/collectd-plugins/c/test_likwid -v3 -g$group_string >> $DEBUG_PATH 2>&1
echo -e "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $DEBUG_PATH 2>&1
$DAEMON >> $DEBUG_PATH 2>&1
# if no PID file was created the daemon could not start
#if [ ! -f ${COLLECTD_PID_FILE} ]; then
# echo -e "Error: Collectd could not be started.">> $DEBUG_PATH 2>&1
#fi
# check if collectd is up and running
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nDaemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
sleep 1
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nDaemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
echo -e "\nError: Collectd could not be started!" >> $DEBUG_PATH 2>&1
fi
fi
fi
fi
#!/bin/bash
if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
# get utility functions
source ${PIKA_ROOT}/pika_utils.sh >> $DEBUG_PATH 2>&1
......@@ -52,6 +52,14 @@ if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
JOB_ARRAY_ID=${PIKA_JOB_ARRAY_ID}
fi
# Check if the job name is too long (maximum length is 256).
# Names longer than 252 characters are cut to their first 252 characters and
# "..." is appended, so the stored name is at most 255 characters long.
# NOTE(review): names of 253-256 characters are shortened as well, although they
# are within the stated maximum -- presumably intentional headroom; confirm.
chrlen=${#PIKA_JOB_NAME}
if [ $chrlen -gt 252 ]; then
# log the original length before truncating
echo "Length of job name: $chrlen" >> $DEBUG_PATH 2>&1
PIKA_JOB_NAME="${PIKA_JOB_NAME:0:252}..."
fi
# convert walltime from minutes to seconds!!!
# if [[ ! -z "${PIKA_JOB_WALLTIME}" ]]; then
# PIKA_JOB_WALLTIME=$((PIKA_JOB_WALLTIME * 60))
......@@ -70,4 +78,4 @@ if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
mysql_command "${SQL_QUERY}"
fi
fi
\ No newline at end of file
fi
......@@ -48,6 +48,7 @@ def main(job_id, debug_path, env_file, force):
if slurm_env_string and len(str(slurm_env_string)) > 0:
try:
slurm_env = cPickle.loads(slurm_env_string)
#print(slurm_env)
except:
slurm_env = {}
......@@ -115,7 +116,12 @@ def save_job_env(env_file, slurm_env, connection, debug_file):
partition_data_string = connection.get(partition_name)
partition_data = cPickle.loads(partition_data_string)
#partition_data = ast.literal_eval(str(partition_data_string))
cpus_avail = int(partition_data['max_cpus_per_node'])
try:
cpus_avail = int(partition_data['max_cpus_per_node'])
#print(str(partition_data))
except:
cpus_avail = -1
#print(str(partition_data))
if (total_cpus_allocated / node_count) == cpus_avail:
#print "Exclusive with " + str(total_cpus_allocated / node_count) + " cpus per node on partition " + partition_name
......
#!/bin/bash
if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
# get utility functions
source ${PIKA_ROOT}/pika_utils.sh >> $DEBUG_PATH 2>&1
......
......@@ -34,6 +34,9 @@ else
DEBUG_PATH=/dev/null
fi
# print date
date >> $DEBUG_PATH 2>&1
##### (4) if SLURM_NODELIST is of length zero
if [[ -z "${SLURM_NODELIST}" ]]; then
......@@ -50,7 +53,7 @@ if [ ! -d "${PIKA_INSTALL_PATH}" ]; then
echo -e "\nInstall/Replace PIKA software stack to ${PIKA_INSTALL_PATH}" >> $DEBUG_PATH 2>&1
# check if an old collectd daemon is still running, if so kill it
echo -e "\nCheck if an old pika collectd is still running" >> $DEBUG_PATH 2>&1
echo -e "\nCheck if an old PIKA collectd is still running" >> $DEBUG_PATH 2>&1
DAEMON="pika_collectd"
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
echo -e "\nCOLLECTD_PID=$COLLECTD_PID" >> $DEBUG_PATH 2>&1
......@@ -58,8 +61,17 @@ if [ ! -d "${PIKA_INSTALL_PATH}" ]; then
if [ -z "$COLLECTD_PID" ]; then
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
else
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nNumber of active old PIKA Collectd processes: ${old_pika_collectd_procs}. Try to terminate them." >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
sleep 1
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
if [ $old_pika_collectd_procs -gt 0 ]; then
echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -KILL $COLLECTD_PID
kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nError: Could not terminate old PIKA Collectd processes. ${old_pika_collectd_procs} are still running." >> $DEBUG_PATH 2>&1
fi
# delete old installation if it is still there
......@@ -87,10 +99,10 @@ echo -e "\nCheck python3 path:" >> $DEBUG_PATH 2>&1
which python3 >> $DEBUG_PATH 2>&1
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1`
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
# this node's name
PIKA_HOSTNAME=$(hostname)
PIKA_HOSTNAME=$(hostname | cut -d. -f1)
echo -e "\nMASTER_NODE=$MASTER_NODE" >> $DEBUG_PATH 2>&1
echo -e "\nPIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment