diff --git a/NEWS b/NEWS index 1505b835408573367bc7f5907305929d3818ea70..57c3b63fc0412c47bf289c6d5f527970225687c6 100644 --- a/NEWS +++ b/NEWS @@ -156,6 +156,9 @@ documents those changes that are of interest to users and admins. requirements (from Miguel Roa, BSC). -- Permit a user to change a pending job's TasksPerNode specification using scontrol (from Miguel Roa, BSC). + -- Add support for node UP/DOWN event logging in jobacct/gold plugin. + WARNING: using the jobacct/gold plugin slows the system startup; set the + MessageTimeout variable in slurm.conf to 20 or higher. * Changes in SLURM 1.2.21 ========================= diff --git a/configure b/configure index 69971f19a26647464ac76fa21490216337806d9b..a0c291f30122161af708daf55e29f574b5e83de0 100755 --- a/configure +++ b/configure @@ -25117,221 +25117,10 @@ echo "$as_me: WARNING: *** pg_config not found. Evidently no PostgreSQL install save_CFLAGS="$CFLAGS" CFLAGS="$PGSQL_CFLAGS $save_CFLAGS" - -for ac_header in $PGSQL_INCLUDEDIR/libpq-fe.h -do -as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh` -if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then - { echo "$as_me:$LINENO: checking for $ac_header" >&5 -echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; } -if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -fi -ac_res=`eval echo '${'$as_ac_Header'}'` - { echo "$as_me:$LINENO: result: $ac_res" >&5 -echo "${ECHO_T}$ac_res" >&6; } -else - # Is the header compilable? -{ echo "$as_me:$LINENO: checking $ac_header usability" >&5 -echo $ECHO_N "checking $ac_header usability... $ECHO_C" >&6; } -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. 
*/ -$ac_includes_default -#include <$ac_header> -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_header_compiler=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_header_compiler=no -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5 -echo "${ECHO_T}$ac_header_compiler" >&6; } - -# Is the header present? -{ echo "$as_me:$LINENO: checking $ac_header presence" >&5 -echo $ECHO_N "checking $ac_header presence... $ECHO_C" >&6; } -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include <$ac_header> -_ACEOF -if { (ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } >/dev/null && { - test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" || - test ! 
-s conftest.err - }; then - ac_header_preproc=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_header_preproc=no -fi - -rm -f conftest.err conftest.$ac_ext -{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5 -echo "${ECHO_T}$ac_header_preproc" >&6; } - -# So? What about this header? -case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in - yes:no: ) - { echo "$as_me:$LINENO: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&5 -echo "$as_me: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&2;} - { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the compiler's result" >&5 -echo "$as_me: WARNING: $ac_header: proceeding with the compiler's result" >&2;} - ac_header_preproc=yes - ;; - no:yes:* ) - { echo "$as_me:$LINENO: WARNING: $ac_header: present but cannot be compiled" >&5 -echo "$as_me: WARNING: $ac_header: present but cannot be compiled" >&2;} - { echo "$as_me:$LINENO: WARNING: $ac_header: check for missing prerequisite headers?" >&5 -echo "$as_me: WARNING: $ac_header: check for missing prerequisite headers?" >&2;} - { echo "$as_me:$LINENO: WARNING: $ac_header: see the Autoconf documentation" >&5 -echo "$as_me: WARNING: $ac_header: see the Autoconf documentation" >&2;} - { echo "$as_me:$LINENO: WARNING: $ac_header: section \"Present But Cannot Be Compiled\"" >&5 -echo "$as_me: WARNING: $ac_header: section \"Present But Cannot Be Compiled\"" >&2;} - { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the preprocessor's result" >&5 -echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result" >&2;} - { echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 -echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} - - ;; -esac -{ echo "$as_me:$LINENO: checking for $ac_header" >&5 -echo $ECHO_N "checking for $ac_header... 
$ECHO_C" >&6; } -if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - eval "$as_ac_Header=\$ac_header_preproc" -fi -ac_res=`eval echo '${'$as_ac_Header'}'` - { echo "$as_me:$LINENO: result: $ac_res" >&5 -echo "${ECHO_T}$ac_res" >&6; } - -fi -if test `eval echo '${'$as_ac_Header'}'` = yes; then - cat >>confdefs.h <<_ACEOF -#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1 -_ACEOF - has_pgsql_header="true" -else - has_pgsql_header="false" -fi - -done - - if test "$has_pgsql_header" = "true"; then - { echo "$as_me:$LINENO: checking for PQconnectdb in -lpq" >&5 -echo $ECHO_N "checking for PQconnectdb in -lpq... $ECHO_C" >&6; } -if test "${ac_cv_lib_pq_PQconnectdb+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lpq $LIBS" -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char PQconnectdb (); -int -main () -{ -return PQconnectdb (); - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! 
-s conftest.err - } && test -s conftest$ac_exeext && - $as_test_x conftest$ac_exeext; then - ac_cv_lib_pq_PQconnectdb=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_cv_lib_pq_PQconnectdb=no -fi - -rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ echo "$as_me:$LINENO: result: $ac_cv_lib_pq_PQconnectdb" >&5 -echo "${ECHO_T}$ac_cv_lib_pq_PQconnectdb" >&6; } -if test $ac_cv_lib_pq_PQconnectdb = yes; then - has_pgsql_lib="true" -else - has_pgsql_lib="false" -fi - - if test "$has_pgsql_lib" = "true"; then - PGSQL_LIBS=" -lpq" - save_LIBS="$LIBS" - LIBS="$PGSQL_LIBS $save_LIBS" - cat >conftest.$ac_ext <<_ACEOF + PGSQL_LIBS=" -lpq" + save_LIBS="$LIBS" + LIBS="$PGSQL_LIBS $save_LIBS" + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -25342,12 +25131,12 @@ int main () { - int main() - { - PGconn *conn; - conn = PQconnectdb("dbname = postgres"); - (void) PQfinish(conn); - } + int main() + { + PGconn *conn; + conn = PQconnectdb("dbname = postgres"); + (void) PQfinish(conn); + } ; return 0; @@ -25381,9 +25170,10 @@ fi rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ conftest$ac_exeext conftest.$ac_ext - LIBS="$save_LIBS" - if test "$ac_have_pgsql" == "yes"; then - { echo "$as_me:$LINENO: result: PostgreSQL test program built properly." >&5 + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + if test "$ac_have_pgsql" == "yes"; then + { echo "$as_me:$LINENO: result: PostgreSQL test program built properly." >&5 echo "${ECHO_T}PostgreSQL test program built properly." >&6; } @@ -25392,20 +25182,11 @@ cat >>confdefs.h <<\_ACEOF #define HAVE_PGSQL 1 _ACEOF - else - { echo "$as_me:$LINENO: WARNING: *** PostgreSQL test program execution failed." >&5 + else + { echo "$as_me:$LINENO: WARNING: *** PostgreSQL test program execution failed." 
>&5 echo "$as_me: WARNING: *** PostgreSQL test program execution failed." >&2;} - fi - else - { echo "$as_me:$LINENO: WARNING: libpq not found: PostgreSQL support not available" >&5 -echo "$as_me: WARNING: libpq not found: PostgreSQL support not available" >&2;} - fi - else - { echo "$as_me:$LINENO: WARNING: libpq-fe.h header not found: PostgreSQL support not available" >&5 -echo "$as_me: WARNING: libpq-fe.h header not found: PostgreSQL support not available" >&2;} - fi - CFLAGS="$save_CFLAGS" - fi + fi + fi @@ -26741,7 +26522,7 @@ _ACEOF -ac_config_files="$ac_config_files Makefile config.xml auxdir/Makefile contribs/Makefile contribs/perlapi/Makefile contribs/perlapi/libslurm-perl/Makefile.PL contribs/torque/Makefile src/Makefile src/api/Makefile src/common/Makefile src/sacct/Makefile src/salloc/Makefile src/sbatch/Makefile src/sattach/Makefile src/srun/Makefile src/slurmd/Makefile src/slurmd/slurmd/Makefile src/slurmd/slurmstepd/Makefile src/slurmctld/Makefile src/sbcast/Makefile src/scontrol/Makefile src/scancel/Makefile src/squeue/Makefile src/sinfo/Makefile src/smap/Makefile src/strigger/Makefile src/sview/Makefile src/plugins/Makefile src/plugins/auth/Makefile src/plugins/auth/authd/Makefile src/plugins/auth/munge/Makefile src/plugins/auth/none/Makefile src/plugins/checkpoint/Makefile src/plugins/checkpoint/aix/Makefile src/plugins/checkpoint/none/Makefile src/plugins/checkpoint/ompi/Makefile src/plugins/checkpoint/xlch/Makefile src/plugins/crypto/Makefile src/plugins/crypto/munge/Makefile src/plugins/crypto/openssl/Makefile src/plugins/jobacct_gather/Makefile src/plugins/jobacct_gather/linux/Makefile src/plugins/jobacct_gather/aix/Makefile src/plugins/jobacct_gather/none/Makefile src/plugins/jobacct_storage/Makefile src/plugins/jobacct_storage/filetxt/Makefile src/plugins/jobacct_storage/mysql/Makefile src/plugins/jobacct_storage/pgsql/Makefile src/plugins/jobacct_storage/none/Makefile src/plugins/jobcomp/Makefile src/plugins/jobcomp/filetxt/Makefile 
src/plugins/jobcomp/none/Makefile src/plugins/jobcomp/script/Makefile src/plugins/jobcomp/mysql/Makefile src/plugins/jobcomp/pgsql/Makefile src/plugins/proctrack/Makefile src/plugins/proctrack/aix/Makefile src/plugins/proctrack/pgid/Makefile src/plugins/proctrack/linuxproc/Makefile src/plugins/proctrack/rms/Makefile src/plugins/proctrack/sgi_job/Makefile src/plugins/sched/Makefile src/plugins/sched/backfill/Makefile src/plugins/sched/builtin/Makefile src/plugins/sched/gang/Makefile src/plugins/sched/hold/Makefile src/plugins/sched/wiki/Makefile src/plugins/sched/wiki2/Makefile src/plugins/select/Makefile src/plugins/select/bluegene/Makefile src/plugins/select/bluegene/block_allocator/Makefile src/plugins/select/bluegene/plugin/Makefile src/plugins/select/linear/Makefile src/plugins/select/cons_res/Makefile src/plugins/switch/Makefile src/plugins/switch/elan/Makefile src/plugins/switch/none/Makefile src/plugins/switch/federation/Makefile src/plugins/mpi/Makefile src/plugins/mpi/mpich1_p4/Makefile src/plugins/mpi/mpich1_shmem/Makefile src/plugins/mpi/mpichgm/Makefile src/plugins/mpi/mpichmx/Makefile src/plugins/mpi/mvapich/Makefile src/plugins/mpi/lam/Makefile src/plugins/mpi/none/Makefile src/plugins/mpi/openmpi/Makefile src/plugins/task/Makefile src/plugins/task/affinity/Makefile src/plugins/task/none/Makefile doc/Makefile doc/man/Makefile doc/html/Makefile doc/html/configurator.html testsuite/Makefile testsuite/expect/Makefile testsuite/slurm_unit/Makefile testsuite/slurm_unit/common/Makefile testsuite/slurm_unit/slurmctld/Makefile testsuite/slurm_unit/slurmd/Makefile testsuite/slurm_unit/api/Makefile testsuite/slurm_unit/api/manual/Makefile" +ac_config_files="$ac_config_files Makefile config.xml auxdir/Makefile contribs/Makefile contribs/perlapi/Makefile contribs/perlapi/libslurm-perl/Makefile.PL contribs/torque/Makefile src/Makefile src/api/Makefile src/common/Makefile src/sacct/Makefile src/salloc/Makefile src/sbatch/Makefile src/sattach/Makefile 
src/srun/Makefile src/slurmd/Makefile src/slurmd/slurmd/Makefile src/slurmd/slurmstepd/Makefile src/slurmctld/Makefile src/sbcast/Makefile src/scontrol/Makefile src/scancel/Makefile src/squeue/Makefile src/sinfo/Makefile src/smap/Makefile src/strigger/Makefile src/sview/Makefile src/plugins/Makefile src/plugins/auth/Makefile src/plugins/auth/authd/Makefile src/plugins/auth/munge/Makefile src/plugins/auth/none/Makefile src/plugins/checkpoint/Makefile src/plugins/checkpoint/aix/Makefile src/plugins/checkpoint/none/Makefile src/plugins/checkpoint/ompi/Makefile src/plugins/checkpoint/xlch/Makefile src/plugins/crypto/Makefile src/plugins/crypto/munge/Makefile src/plugins/crypto/openssl/Makefile src/plugins/jobacct_gather/Makefile src/plugins/jobacct_gather/linux/Makefile src/plugins/jobacct_gather/aix/Makefile src/plugins/jobacct_gather/none/Makefile src/plugins/jobacct_storage/Makefile src/plugins/jobacct_storage/filetxt/Makefile src/plugins/jobacct_storage/gold/Makefile src/plugins/jobacct_storage/mysql/Makefile src/plugins/jobacct_storage/pgsql/Makefile src/plugins/jobacct_storage/none/Makefile src/plugins/jobcomp/Makefile src/plugins/jobcomp/filetxt/Makefile src/plugins/jobcomp/none/Makefile src/plugins/jobcomp/script/Makefile src/plugins/jobcomp/mysql/Makefile src/plugins/jobcomp/pgsql/Makefile src/plugins/nodeacct_storage/Makefile src/plugins/nodeacct_storage/gold/Makefile src/plugins/nodeacct_storage/mysql/Makefile src/plugins/nodeacct_storage/pgsql/Makefile src/plugins/nodeacct_storage/none/Makefile src/plugins/proctrack/Makefile src/plugins/proctrack/aix/Makefile src/plugins/proctrack/pgid/Makefile src/plugins/proctrack/linuxproc/Makefile src/plugins/proctrack/rms/Makefile src/plugins/proctrack/sgi_job/Makefile src/plugins/sched/Makefile src/plugins/sched/backfill/Makefile src/plugins/sched/builtin/Makefile src/plugins/sched/gang/Makefile src/plugins/sched/hold/Makefile src/plugins/sched/wiki/Makefile src/plugins/sched/wiki2/Makefile src/plugins/select/Makefile 
src/plugins/select/bluegene/Makefile src/plugins/select/bluegene/block_allocator/Makefile src/plugins/select/bluegene/plugin/Makefile src/plugins/select/linear/Makefile src/plugins/select/cons_res/Makefile src/plugins/switch/Makefile src/plugins/switch/elan/Makefile src/plugins/switch/none/Makefile src/plugins/switch/federation/Makefile src/plugins/mpi/Makefile src/plugins/mpi/mpich1_p4/Makefile src/plugins/mpi/mpich1_shmem/Makefile src/plugins/mpi/mpichgm/Makefile src/plugins/mpi/mpichmx/Makefile src/plugins/mpi/mvapich/Makefile src/plugins/mpi/lam/Makefile src/plugins/mpi/none/Makefile src/plugins/mpi/openmpi/Makefile src/plugins/task/Makefile src/plugins/task/affinity/Makefile src/plugins/task/none/Makefile doc/Makefile doc/man/Makefile doc/html/Makefile doc/html/configurator.html testsuite/Makefile testsuite/expect/Makefile testsuite/slurm_unit/Makefile testsuite/slurm_unit/common/Makefile testsuite/slurm_unit/slurmctld/Makefile testsuite/slurm_unit/slurmd/Makefile testsuite/slurm_unit/api/Makefile testsuite/slurm_unit/api/manual/Makefile" cat >confcache <<\_ACEOF @@ -27503,6 +27284,7 @@ do "src/plugins/jobacct_gather/none/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobacct_gather/none/Makefile" ;; "src/plugins/jobacct_storage/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobacct_storage/Makefile" ;; "src/plugins/jobacct_storage/filetxt/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobacct_storage/filetxt/Makefile" ;; + "src/plugins/jobacct_storage/gold/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobacct_storage/gold/Makefile" ;; "src/plugins/jobacct_storage/mysql/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobacct_storage/mysql/Makefile" ;; "src/plugins/jobacct_storage/pgsql/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobacct_storage/pgsql/Makefile" ;; "src/plugins/jobacct_storage/none/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobacct_storage/none/Makefile" ;; @@ -27512,6 +27294,11 @@ do 
"src/plugins/jobcomp/script/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobcomp/script/Makefile" ;; "src/plugins/jobcomp/mysql/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobcomp/mysql/Makefile" ;; "src/plugins/jobcomp/pgsql/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/jobcomp/pgsql/Makefile" ;; + "src/plugins/nodeacct_storage/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/nodeacct_storage/Makefile" ;; + "src/plugins/nodeacct_storage/gold/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/nodeacct_storage/gold/Makefile" ;; + "src/plugins/nodeacct_storage/mysql/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/nodeacct_storage/mysql/Makefile" ;; + "src/plugins/nodeacct_storage/pgsql/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/nodeacct_storage/pgsql/Makefile" ;; + "src/plugins/nodeacct_storage/none/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/nodeacct_storage/none/Makefile" ;; "src/plugins/proctrack/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/proctrack/Makefile" ;; "src/plugins/proctrack/aix/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/proctrack/aix/Makefile" ;; "src/plugins/proctrack/pgid/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/proctrack/pgid/Makefile" ;; diff --git a/configure.ac b/configure.ac index 2407a4cf1415a32bc5e39f35aef0802ff835cc54..83b8fea1817c1dff84909f3027eaa1e7a419b2f6 100644 --- a/configure.ac +++ b/configure.ac @@ -311,6 +311,7 @@ AC_CONFIG_FILES([Makefile src/plugins/jobacct_gather/none/Makefile src/plugins/jobacct_storage/Makefile src/plugins/jobacct_storage/filetxt/Makefile + src/plugins/jobacct_storage/gold/Makefile src/plugins/jobacct_storage/mysql/Makefile src/plugins/jobacct_storage/pgsql/Makefile src/plugins/jobacct_storage/none/Makefile @@ -320,6 +321,11 @@ AC_CONFIG_FILES([Makefile src/plugins/jobcomp/script/Makefile src/plugins/jobcomp/mysql/Makefile src/plugins/jobcomp/pgsql/Makefile + src/plugins/nodeacct_storage/Makefile + src/plugins/nodeacct_storage/gold/Makefile + 
src/plugins/nodeacct_storage/mysql/Makefile + src/plugins/nodeacct_storage/pgsql/Makefile + src/plugins/nodeacct_storage/none/Makefile src/plugins/proctrack/Makefile src/plugins/proctrack/aix/Makefile src/plugins/proctrack/pgid/Makefile diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 8a2c9cd4ff2386d8ece11e9b89f5206ef16abbde..210cd346d74910b4f06c1264a1751a223881e084 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1392,7 +1392,7 @@ These tasks initiated outside of SLURM's monitoring or control. SLURM's epilog should be configured to purge these tasks when the job's allocation is relinquished. -See \fIhttps://computing.linux.gov/linux/slurm/quickstart.html#mpi\fR +See \fIhttps://computing.llnl.gov/linux/slurm/quickstart.html#mpi\fR for more information on use of these various MPI implementation with SLURM. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index e85be2a79c4fa970bc4dc8297f86ef773ec91df8..ad4e8ed5ffeaf65c03761c6063f25409469752b6 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -924,6 +924,7 @@ typedef struct slurm_ctl_conf { time_t boot_time; /* time slurmctld last booted */ uint16_t cache_groups; /* cache /etc/groups to avoid initgroups(2) */ char *checkpoint_type; /* checkpoint plugin type */ + char *cluster_name; /* general name of the entire cluster */ char *control_addr; /* comm path of slurmctld primary server */ char *control_machine; /* name of slurmctld primary server */ char *crypto_type; /* cryptographic signature plugin */ @@ -962,6 +963,12 @@ typedef struct slurm_ctl_conf { * purged from in memory records */ char *mpi_default; /* Default version of MPI in use */ uint16_t msg_timeout; /* message timeout */ + char *node_acct_storage_loc; /* node accounting storage log location */ + char *node_acct_storage_type; /* node accounting storage type */ + char *node_acct_storage_user; /* node accounting storage user */ + char *node_acct_storage_host; /* node accounting storage host */ + char *node_acct_storage_pass; 
/* node accounting storage password */ + uint32_t node_acct_storage_port;/* node accounting storage port */ char *node_prefix; /* prefix of nodes in partition only set in bluegene clusters NULL otherwise */ char *plugindir; /* pathname to plugins */ diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 68ff83768f10ac48989b1166623105665ba8fd60..bbc95f1dfc08fc43f4d3786283790af31b1dd34b 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -67,6 +67,7 @@ libcommon_la_SOURCES = \ slurm_jobacct_gather.c slurm_jobacct_gather.h \ slurm_jobacct_storage.c slurm_jobacct_storage.h \ slurm_jobcomp.c slurm_jobcomp.h \ + slurm_nodeacct_storage.c slurm_nodeacct_storage.h \ switch.c switch.h \ arg_desc.c arg_desc.h \ macros.h \ diff --git a/src/common/Makefile.in b/src/common/Makefile.in index 96f8b247ac479c0dfb3b769b0141fe8e2bc42279..580ebd31e62cb6ef3a1663d7c56f9127fba18835 100644 --- a/src/common/Makefile.in +++ b/src/common/Makefile.in @@ -89,7 +89,8 @@ am__libcommon_la_SOURCES_DIST = xmalloc.c xmalloc.h xassert.c \ slurm_auth.c slurm_auth.h jobacct_common.c jobacct_common.h \ slurm_jobacct_gather.c slurm_jobacct_gather.h \ slurm_jobacct_storage.c slurm_jobacct_storage.h \ - slurm_jobcomp.c slurm_jobcomp.h switch.c switch.h arg_desc.c \ + slurm_jobcomp.c slurm_jobcomp.h slurm_nodeacct_storage.c \ + slurm_nodeacct_storage.h switch.c switch.h arg_desc.c \ arg_desc.h macros.h malloc.c malloc.h getopt.h getopt.c \ getopt1.c unsetenv.c unsetenv.h slurm_selecttype_info.c \ slurm_resource_info.c slurm_resource_info.h hostlist.c \ @@ -122,7 +123,8 @@ am_libcommon_la_OBJECTS = libcommon_la-xmalloc.lo \ libcommon_la-jobacct_common.lo \ libcommon_la-slurm_jobacct_gather.lo \ libcommon_la-slurm_jobacct_storage.lo \ - libcommon_la-slurm_jobcomp.lo libcommon_la-switch.lo \ + libcommon_la-slurm_jobcomp.lo \ + libcommon_la-slurm_nodeacct_storage.lo libcommon_la-switch.lo \ libcommon_la-arg_desc.lo libcommon_la-malloc.lo \ libcommon_la-getopt.lo 
libcommon_la-getopt1.lo \ $(am__objects_1) libcommon_la-slurm_selecttype_info.lo \ @@ -392,6 +394,7 @@ libcommon_la_SOURCES = \ slurm_jobacct_gather.c slurm_jobacct_gather.h \ slurm_jobacct_storage.c slurm_jobacct_storage.h \ slurm_jobcomp.c slurm_jobcomp.h \ + slurm_nodeacct_storage.c slurm_nodeacct_storage.h \ switch.c switch.h \ arg_desc.c arg_desc.h \ macros.h \ @@ -528,6 +531,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcommon_la-slurm_jobacct_gather.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcommon_la-slurm_jobacct_storage.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcommon_la-slurm_jobcomp.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcommon_la-slurm_nodeacct_storage.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcommon_la-slurm_protocol_api.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcommon_la-slurm_protocol_defs.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcommon_la-slurm_protocol_pack.Plo@am__quote@ @@ -845,6 +849,13 @@ libcommon_la-slurm_jobcomp.lo: slurm_jobcomp.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_la_CFLAGS) $(CFLAGS) -c -o libcommon_la-slurm_jobcomp.lo `test -f 'slurm_jobcomp.c' || echo '$(srcdir)/'`slurm_jobcomp.c +libcommon_la-slurm_nodeacct_storage.lo: slurm_nodeacct_storage.c +@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_la_CFLAGS) $(CFLAGS) -MT libcommon_la-slurm_nodeacct_storage.lo -MD -MP -MF $(DEPDIR)/libcommon_la-slurm_nodeacct_storage.Tpo -c -o libcommon_la-slurm_nodeacct_storage.lo `test -f 'slurm_nodeacct_storage.c' || 
echo '$(srcdir)/'`slurm_nodeacct_storage.c +@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/libcommon_la-slurm_nodeacct_storage.Tpo $(DEPDIR)/libcommon_la-slurm_nodeacct_storage.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='slurm_nodeacct_storage.c' object='libcommon_la-slurm_nodeacct_storage.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_la_CFLAGS) $(CFLAGS) -c -o libcommon_la-slurm_nodeacct_storage.lo `test -f 'slurm_nodeacct_storage.c' || echo '$(srcdir)/'`slurm_nodeacct_storage.c + libcommon_la-switch.lo: switch.c @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libcommon_la_CFLAGS) $(CFLAGS) -MT libcommon_la-switch.lo -MD -MP -MF $(DEPDIR)/libcommon_la-switch.Tpo -c -o libcommon_la-switch.lo `test -f 'switch.c' || echo '$(srcdir)/'`switch.c @am__fastdepCC_TRUE@ mv -f $(DEPDIR)/libcommon_la-switch.Tpo $(DEPDIR)/libcommon_la-switch.Plo diff --git a/src/common/read_config.c b/src/common/read_config.c index 1efac6f7f67163ab9bce13e4bde47a962a92b26c..d3390942b96b82377065f42fc0986a7b786659a6 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -127,6 +127,7 @@ s_p_options_t slurm_conf_options[] = { {"BackupController", S_P_STRING}, {"ControlAddr", S_P_STRING}, {"ControlMachine", S_P_STRING}, + {"ClusterName", S_P_STRING}, {"CryptoType", S_P_STRING}, {"DefMemPerTask", S_P_UINT32}, {"Epilog", S_P_STRING}, @@ -162,6 +163,12 @@ s_p_options_t slurm_conf_options[] = { {"MaxMemPerTask", S_P_UINT32}, {"MessageTimeout", S_P_UINT16}, {"MinJobAge", S_P_UINT16}, + {"NodeAcctStorageLoc", S_P_STRING}, + {"NodeAcctStorageType", S_P_STRING}, + {"NodeAcctStorageHost", S_P_STRING}, + {"NodeAcctStorageUser", 
S_P_STRING}, + {"NodeAcctStoragePass", S_P_STRING}, + {"NodeAcctStoragePort", S_P_UINT32}, {"MpichGmDirectSupport", S_P_LONG}, {"MpiDefault", S_P_STRING}, {"PluginDir", S_P_STRING}, @@ -1107,6 +1114,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->authtype); ctl_conf_ptr->cache_groups = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->checkpoint_type); + xfree (ctl_conf_ptr->cluster_name); xfree (ctl_conf_ptr->backup_addr); xfree (ctl_conf_ptr->backup_controller); xfree (ctl_conf_ptr->control_addr); @@ -1143,6 +1151,12 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->mpi_default); ctl_conf_ptr->msg_timeout = (uint16_t) NO_VAL; ctl_conf_ptr->next_job_id = (uint32_t) NO_VAL; + xfree (ctl_conf_ptr->node_acct_storage_loc); + xfree (ctl_conf_ptr->node_acct_storage_type); + xfree (ctl_conf_ptr->node_acct_storage_user); + xfree (ctl_conf_ptr->node_acct_storage_host); + xfree (ctl_conf_ptr->node_acct_storage_pass); + ctl_conf_ptr->node_acct_storage_port = 0; xfree (ctl_conf_ptr->plugindir); xfree (ctl_conf_ptr->plugstack); ctl_conf_ptr->private_data = 0; @@ -1431,6 +1445,8 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) conf->backup_addr = xstrdup(conf->backup_controller); } + s_p_get_string(&conf->cluster_name, "ClusterName", hashtbl); + if (!s_p_get_string(&conf->control_machine, "ControlMachine", hashtbl)) fatal ("validate_and_set_defaults: " "ControlMachine not specified."); @@ -1615,6 +1631,37 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) if (!s_p_get_string(&conf->mpi_default, "MpiDefault", hashtbl)) conf->mpi_default = xstrdup(DEFAULT_MPI_DEFAULT); + /* NodeAcctStorageLoc is the node accounting storage location + * (its exact meaning presumably depends on the storage type in + * use); fall back to the compiled-in default when it is not set. + */ + if (!s_p_get_string(&conf->node_acct_storage_loc, + "NodeAcctStorageLoc", hashtbl)) + conf->node_acct_storage_loc = + 
xstrdup(DEFAULT_NODE_ACCT_STORAGE_LOC); + + if (!s_p_get_string(&conf->node_acct_storage_type, + "NodeAcctStorageType", hashtbl)) { + conf->node_acct_storage_type = + xstrdup(DEFAULT_NODE_ACCT_STORAGE_TYPE); + } + if (!s_p_get_string(&conf->node_acct_storage_host, + "NodeAcctStorageHost", hashtbl)) + conf->node_acct_storage_host = + xstrdup(DEFAULT_NODE_ACCT_STORAGE_HOST); + if (!s_p_get_string(&conf->node_acct_storage_user, + "NodeAcctStorageUser", hashtbl)) + conf->node_acct_storage_user = + xstrdup(DEFAULT_NODE_ACCT_STORAGE_USER); + if (!s_p_get_string(&conf->node_acct_storage_pass, + "NodeAcctStoragePass", hashtbl)) + conf->node_acct_storage_pass = + xstrdup(DEFAULT_NODE_ACCT_STORAGE_PASS); + if (!s_p_get_uint32(&conf->node_acct_storage_port, + "NodeAcctStoragePort", hashtbl)) + conf->node_acct_storage_port = + DEFAULT_NODE_ACCT_STORAGE_PORT; + if (!s_p_get_string(&conf->plugindir, "PluginDir", hashtbl)) conf->plugindir = xstrdup(default_plugin_path); diff --git a/src/common/read_config.h b/src/common/read_config.h index cb28d9d083cd6fade6ab5a166ffcaf9af0e0bdfe..06cd578469b37c205fd27582155fa4afc8c7db58 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -60,7 +60,7 @@ extern char *default_plugstack; #define DEFAULT_JOB_ACCT_GATHER_FREQ 30 #define DEFAULT_JOB_ACCT_STORAGE_TYPE "jobacct_storage/filetxt" #define JOB_ACCT_STORAGE_TYPE_NONE "jobacct_storage/none" -#define DEFAULT_JOB_ACCT_STORAGE_LOC "/var/log/slurm_accounting.log" +#define DEFAULT_JOB_ACCT_STORAGE_LOC "/var/log/slurm_jobacct.log" #define DEFAULT_JOB_ACCT_STORAGE_HOST "localhost" #define DEFAULT_JOB_ACCT_STORAGE_USER "root" #define DEFAULT_JOB_ACCT_STORAGE_PASS "" @@ -80,6 +80,13 @@ extern char *default_plugstack; #define DEFAULT_MIN_JOB_AGE 300 #define DEFAULT_MPI_DEFAULT "none" #define DEFAULT_MSG_TIMEOUT 10 +#define DEFAULT_NODE_ACCT_STORAGE_TYPE "nodeacct_storage/none" +#define NODE_ACCT_STORAGE_TYPE_NONE "nodeacct_storage/none" +#define DEFAULT_NODE_ACCT_STORAGE_LOC 
"/var/log/slurm_nodeacct.log" +#define DEFAULT_NODE_ACCT_STORAGE_HOST "localhost" +#define DEFAULT_NODE_ACCT_STORAGE_USER "root" +#define DEFAULT_NODE_ACCT_STORAGE_PASS "" +#define DEFAULT_NODE_ACCT_STORAGE_PORT 0 #ifdef HAVE_AIX /* AIX specific default configuration parameters */ # define DEFAULT_CHECKPOINT_TYPE "checkpoint/aix" # define DEFAULT_PROCTRACK_TYPE "proctrack/aix" diff --git a/src/common/slurm_nodeacct_storage.c b/src/common/slurm_nodeacct_storage.c new file mode 100644 index 0000000000000000000000000000000000000000..22a073fae34ea7f492f18fcd2f406108e2c32ab9 --- /dev/null +++ b/src/common/slurm_nodeacct_storage.c @@ -0,0 +1,261 @@ +/*****************************************************************************\ + * slurm_nodeacct_storage.c - storage plugin wrapper. + * + * $Id: slurm_nodeacct_storage.c 10744 2007-01-11 20:09:18Z da $ + ***************************************************************************** + * Copyright (C) 2002-2006 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Danny Aubke <da@llnl.gov>. + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. 
If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <pthread.h> + +#include "src/common/list.h" +#include "src/common/slurm_nodeacct_storage.h" +#include "src/common/plugin.h" +#include "src/common/plugrack.h" +#include "src/common/slurm_protocol_api.h" +#include "src/common/xstring.h" +#include "src/slurmctld/slurmctld.h" + +/* + * Local data + */ + +typedef struct slurm_nodeacct_storage_ops { + int (*node_down) (struct node_record *node_ptr, + time_t event_time, + char *reason); + int (*node_up) (struct node_record *node_ptr, + time_t event_time); + int (*cluster_procs) (uint32_t procs, time_t event_time); +} slurm_nodeacct_storage_ops_t; + +typedef struct slurm_nodeacct_storage_context { + char *nodeacct_storage_type; + plugrack_t plugin_list; + plugin_handle_t cur_plugin; + int nodeacct_storage_errno; + slurm_nodeacct_storage_ops_t ops; +} slurm_nodeacct_storage_context_t; + +static slurm_nodeacct_storage_context_t * g_nodeacct_storage_context = NULL; +static pthread_mutex_t g_nodeacct_storage_context_lock = + PTHREAD_MUTEX_INITIALIZER; + +/* + * Local functions + */ +static 
slurm_nodeacct_storage_ops_t *_nodeacct_storage_get_ops(
+	slurm_nodeacct_storage_context_t *c);
+static slurm_nodeacct_storage_context_t *_nodeacct_storage_context_create(
+	const char *nodeacct_storage_type);
+static int _nodeacct_storage_context_destroy(
+	slurm_nodeacct_storage_context_t *c);
+
+/*
+ * Locate and load the appropriate plugin
+ *
+ * Builds the plugin rack on first use, selects the plugin whose type
+ * matches c->nodeacct_storage_type and resolves the symbols listed in
+ * syms[] into c->ops.
+ *
+ * RET pointer to c->ops, or NULL if the rack cannot be created, no plugin
+ *	of the requested type is found, or a symbol is missing.
+ */
+static slurm_nodeacct_storage_ops_t * _nodeacct_storage_get_ops(
+	slurm_nodeacct_storage_context_t *c)
+{
+	/*
+	 * Must be synchronized with slurm_nodeacct_storage_ops_t above.
+	 */
+	static const char *syms[] = {
+		"nodeacct_storage_p_node_down",
+		"nodeacct_storage_p_node_up",
+		"nodeacct_storage_p_cluster_procs"
+	};
+	int n_syms = sizeof( syms ) / sizeof( char * );
+
+	/* Get plugin list. */
+	if ( c->plugin_list == NULL ) {
+		char *plugin_dir;
+		c->plugin_list = plugrack_create();
+		if ( c->plugin_list == NULL ) {
+			error( "cannot create plugin manager" );
+			return NULL;
+		}
+		plugrack_set_major_type( c->plugin_list, "nodeacct_storage" );
+		plugrack_set_paranoia( c->plugin_list,
+				       PLUGRACK_PARANOIA_NONE,
+				       0 );
+		plugin_dir = slurm_get_plugin_dir();
+		plugrack_read_dir( c->plugin_list, plugin_dir );
+		xfree(plugin_dir);
+	}
+
+	c->cur_plugin = plugrack_use_by_type( c->plugin_list,
+					      c->nodeacct_storage_type );
+	if ( c->cur_plugin == PLUGIN_INVALID_HANDLE ) {
+		error( "cannot find nodeacct_storage plugin for %s",
+		       c->nodeacct_storage_type );
+		return NULL;
+	}
+
+	/* Dereference the API. */
+	if ( plugin_get_syms( c->cur_plugin,
+			      n_syms,
+			      syms,
+			      (void **) &c->ops ) < n_syms ) {
+		error( "incomplete nodeacct_storage plugin detected" );
+		return NULL;
+	}
+
+	return &c->ops;
+}
+
+/*
+ * Create a nodeacct_storage context
+ *
+ * IN nodeacct_storage_type - plugin type string; it is copied
+ * RET new context with no plugin loaded yet, or NULL if no type was given
+ */
+static slurm_nodeacct_storage_context_t *_nodeacct_storage_context_create(const char *nodeacct_storage_type)
+{
+	slurm_nodeacct_storage_context_t *c;
+
+	if ( nodeacct_storage_type == NULL ) {
+		debug3( "_nodeacct_storage_context_create: "
+			"no nodeacct_storage type" );
+		return NULL;
+	}
+
+	c = xmalloc( sizeof( slurm_nodeacct_storage_context_t ) );
+	c->nodeacct_storage_type = xstrdup( nodeacct_storage_type );
+	c->plugin_list = NULL;
+	c->cur_plugin = PLUGIN_INVALID_HANDLE;
+	c->nodeacct_storage_errno = SLURM_SUCCESS;
+
+	return c;
+}
+
+/*
+ * Destroy a nodeacct_storage context
+ *
+ * RET SLURM_SUCCESS, or SLURM_ERROR if the plugin rack could not be
+ *	destroyed; in that case the context is left allocated.
+ */
+static int _nodeacct_storage_context_destroy(
+	slurm_nodeacct_storage_context_t *c )
+{
+	/*
+	 * Must check return code here because plugins might still
+	 * be loaded and active.
+	 */
+	if ( c->plugin_list ) {
+		if ( plugrack_destroy( c->plugin_list ) != SLURM_SUCCESS ) {
+			return SLURM_ERROR;
+		}
+	}
+
+	xfree( c->nodeacct_storage_type );
+	xfree( c );
+
+	return SLURM_SUCCESS;
+}
+
+/*
+ * Initialize context for nodeacct_storage plugin
+ *
+ * Does nothing if the global context already exists. Otherwise the
+ * configured plugin type is read, a context is created and the plugin
+ * operations are resolved, all under g_nodeacct_storage_context_lock.
+ *
+ * RET SLURM_SUCCESS or SLURM_ERROR
+ */
+extern int slurm_nodeacct_storage_init(void)
+{
+	int retval = SLURM_SUCCESS;
+	char *nodeacct_storage_type = NULL;
+
+	slurm_mutex_lock( &g_nodeacct_storage_context_lock );
+
+	if ( g_nodeacct_storage_context )
+		goto done;
+
+	nodeacct_storage_type = slurm_get_nodeacct_storage_type();
+	g_nodeacct_storage_context =
+		_nodeacct_storage_context_create(nodeacct_storage_type);
+	if ( g_nodeacct_storage_context == NULL ) {
+		/* Creation fails when the type is NULL, so never hand a
+		 * NULL pointer to the %s conversion. */
+		error( "cannot create nodeacct_storage context for %s",
+		       nodeacct_storage_type ?
+		       nodeacct_storage_type : "(null)" );
+		retval = SLURM_ERROR;
+		goto done;
+	}
+
+	if ( _nodeacct_storage_get_ops( g_nodeacct_storage_context ) == NULL ) {
+		error( "cannot resolve nodeacct_storage plugin operations" );
+		_nodeacct_storage_context_destroy( g_nodeacct_storage_context );
+		g_nodeacct_storage_context = NULL;
+		retval = SLURM_ERROR;
+	}
+
+ done:
+	slurm_mutex_unlock( &g_nodeacct_storage_context_lock );
+	xfree(nodeacct_storage_type);
+	return retval;
+}
+
+/*
+ * Release the nodeacct_storage plugin context, if one exists
+ *
+ * The global context pointer is read and cleared while holding
+ * g_nodeacct_storage_context_lock, the same lock
+ * slurm_nodeacct_storage_init() uses to create it.
+ *
+ * RET SLURM_SUCCESS if there was nothing to do, otherwise the result of
+ *	destroying the context
+ */
+extern int slurm_nodeacct_storage_fini(void)
+{
+	int rc = SLURM_SUCCESS;
+
+	slurm_mutex_lock( &g_nodeacct_storage_context_lock );
+	if ( g_nodeacct_storage_context ) {
+		rc = _nodeacct_storage_context_destroy(
+			g_nodeacct_storage_context );
+		g_nodeacct_storage_context = NULL;
+	}
+	slurm_mutex_unlock( &g_nodeacct_storage_context_lock );
+	return rc;
+}
+
+/*
+ * Forward a node DOWN event to the loaded plugin, loading it on demand
+ * RET SLURM_ERROR if the plugin cannot be initialized, otherwise the
+ *	plugin's return code
+ */
+extern int nodeacct_storage_g_node_down(struct node_record *node_ptr,
+					time_t event_time,
+					char *reason)
+{
+	if (slurm_nodeacct_storage_init() < 0)
+		return SLURM_ERROR;
+	return (*(g_nodeacct_storage_context->ops.node_down))
+		(node_ptr, event_time, reason);
+}
+
+/*
+ * Forward a node UP event to the loaded plugin, loading it on demand
+ * RET SLURM_ERROR if the plugin cannot be initialized, otherwise the
+ *	plugin's return code
+ */
+extern int nodeacct_storage_g_node_up(struct node_record *node_ptr,
+				      time_t event_time)
+{
+	if (slurm_nodeacct_storage_init() < 0)
+		return SLURM_ERROR;
+	return (*(g_nodeacct_storage_context->ops.node_up))
+		(node_ptr, event_time);
+}
+
+
+/*
+ * Forward the cluster processor count to the loaded plugin, loading it
+ * on demand
+ * RET SLURM_ERROR if the plugin cannot be initialized, otherwise the
+ *	plugin's return code
+ */
+extern int nodeacct_storage_g_cluster_procs(uint32_t procs, time_t event_time)
+{
+	if (slurm_nodeacct_storage_init() < 0)
+		return SLURM_ERROR;
+	return (*(g_nodeacct_storage_context->ops.cluster_procs))
+		(procs, event_time);
+}
+
+
diff --git a/src/common/slurm_nodeacct_storage.h b/src/common/slurm_nodeacct_storage.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea6378635458de1d6d076e581a107132ba4cbb05
--- /dev/null
+++ b/src/common/slurm_nodeacct_storage.h
@@ -0,0 +1,61 @@
+/*****************************************************************************\
+ * slurm_nodeacct_storage.h - Define storage plugin functions.
+ *
+ * $Id: slurm_nodeacct_storage.h 10574 2006-12-15 23:38:29Z jette $
+ *****************************************************************************
+ * Copyright (C) 2004-2008 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Danny Auble <da@llnl.gov>
+ * UCRL-CODE-226842.
+ *
+ * This file is part of SLURM, a resource management program.
+ * For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ * SLURM is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * In addition, as a special exception, the copyright holders give permission
+ * to link the code of portions of this program with the OpenSSL library under
+ * certain conditions as described in each individual source file, and
+ * distribute linked combinations including the two. You must obey the GNU
+ * General Public License in all respects for all of the code used other than
+ * OpenSSL. If you modify file(s) with this exception, you may extend this
+ * exception to your version of the file(s), but you are not obligated to do
+ * so.
If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifndef _SLURM_NODEACCT_STORAGE_H +#define _SLURM_NODEACCT_STORAGE_H + +#include "src/common/list.h" +#include "src/slurmctld/slurmctld.h" +#include <slurm/slurm.h> +#include <slurm/slurm_errno.h> + +extern int slurm_nodeacct_storage_init(void); /* load the plugin */ +extern int slurm_nodeacct_storage_fini(void); /* unload the plugin */ + + +extern int nodeacct_storage_g_node_down(struct node_record *node_ptr, + time_t event_time, + char *reason); + +extern int nodeacct_storage_g_node_up(struct node_record *node_ptr, + time_t event_time); + +extern int nodeacct_storage_g_cluster_procs(uint32_t procs, time_t event_time); + +#endif /*_SLURM_NODEACCT_STORAGE_H*/ diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index a730c8910cf17b78d5ec95b15773e84af8a21705..f9017473630e0c0a36128caa683032bda7aa405e 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -274,6 +274,21 @@ extern char *slurm_get_checkpoint_type(void) return checkpoint_type; } +/* slurm_get_cluster_name + * returns the cluster name from slurmctld_conf object + * RET char * - cluster name, MUST be xfreed by caller + */ +char *slurm_get_cluster_name(void) +{ + char *name; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + 
name = xstrdup(conf->cluster_name); + slurm_conf_unlock(); + return name; +} + /* slurm_get_crypto_type * returns the crypto_type from slurmctld_conf object * RET char * - crypto type, MUST be xfreed by caller @@ -576,6 +591,97 @@ uint32_t slurm_get_jobcomp_port(void) } +/* slurm_get_nodeacct_storage_type + * returns the storage type from slurmctld_conf object + * RET char * - storage type, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_type(void) +{ + char *storage_type; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + storage_type = xstrdup(conf->node_acct_storage_type); + slurm_conf_unlock(); + return storage_type; +} + +/* slurm_get_nodeacct_storage_loc + * returns the job accounting loc from the slurmctld_conf object + * RET char * - job accounting loc, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_loc(void) +{ + char *nodeacct_loc; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + nodeacct_loc = xstrdup(conf->node_acct_storage_loc); + slurm_conf_unlock(); + return nodeacct_loc; +} + +/* slurm_get_nodeacct_storage_user + * returns the storage user from slurmctld_conf object + * RET char * - storage user, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_user(void) +{ + char *storage_user; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + storage_user = xstrdup(conf->node_acct_storage_user); + slurm_conf_unlock(); + return storage_user; +} + +/* slurm_get_nodeacct_storage_host + * returns the storage host from slurmctld_conf object + * RET char * - storage host, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_host(void) +{ + char *storage_host; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + storage_host = xstrdup(conf->node_acct_storage_host); + slurm_conf_unlock(); + return storage_host; +} + +/* slurm_get_nodeacct_storage_pass + * returns the storage password from slurmctld_conf object + * RET char * - storage password, MUST be xfreed by caller + */ +char 
*slurm_get_nodeacct_storage_pass(void) +{ + char *storage_pass; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + storage_pass = xstrdup(conf->node_acct_storage_pass); + slurm_conf_unlock(); + return storage_pass; +} + +/* slurm_get_nodeacct_storage_port + * returns the storage port from slurmctld_conf object + * RET uint32_t - storage port + */ +uint32_t slurm_get_nodeacct_storage_port(void) +{ + uint32_t storage_port; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + storage_port = conf->node_acct_storage_port; + slurm_conf_unlock(); + return storage_port; + +} + /* slurm_get_proctrack_type * get ProctrackType from slurmctld_conf object * RET char * - proctrack type, MUST be xfreed by caller diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 10c7d9720db4338dd1bc27e03e5730572d71d4d9..9b0115b17d7347a0ef0b4daf40269d23e9d69818 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -155,6 +155,12 @@ extern int slurm_set_auth_type(char *auth_type); */ extern char *slurm_get_checkpoint_type(void); +/* slurm_get_cluster_name + * returns the cluster name from slurmctld_conf object + * RET char * - cluster name, MUST be xfreed by caller + */ +char *slurm_get_cluster_name(void); + /* slurm_get_crypto_type * returns the crypto_type from slurmctld_conf object * RET char * - crypto type, MUST be xfreed by caller @@ -260,6 +266,42 @@ char *slurm_get_jobcomp_pass(void); */ uint32_t slurm_get_jobcomp_port(void); +/* slurm_get_nodeacct_storage_type + * returns the node accounting type from slurmctld_conf object + * RET char * - node accounting type, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_type(void); + +/* slurm_get_nodeacct_storage_loc + * returns the node accounting loc from slurmctld_conf object + * RET char * - node accounting location, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_loc(void); + +/* slurm_get_nodeacct_storage_user + * returns the storage 
user from slurmctld_conf object + * RET char * - storage user, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_user(void); + +/* slurm_get_nodeacct_storage_host + * returns the storage host from slurmctld_conf object + * RET char * - storage host, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_host(void); + +/* slurm_get_nodeacct_storage_pass + * returns the storage password from slurmctld_conf object + * RET char * - storage password, MUST be xfreed by caller + */ +char *slurm_get_nodeacct_storage_pass(void); + +/* slurm_get_nodeacct_storage_port + * returns the storage port from slurmctld_conf object + * RET uint32_t - storage port + */ +uint32_t slurm_get_nodeacct_storage_port(void); + /* slurm_get_propagate_prio_process * return the PropagatePrioProcess flag from slurmctld_conf object */ diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 624a99a2841c0097eeef162ea0e2ed8d6d066827..09ae189cf6e662177b7868a0676afff89d47118c 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -889,6 +889,7 @@ void slurm_free_ctl_conf(slurm_ctl_conf_info_msg_t * config_ptr) xfree(config_ptr->backup_addr); xfree(config_ptr->backup_controller); xfree(config_ptr->checkpoint_type); + xfree(config_ptr->cluster_name); xfree(config_ptr->control_addr); xfree(config_ptr->control_machine); xfree(config_ptr->crypto_type); @@ -908,6 +909,11 @@ void slurm_free_ctl_conf(slurm_ctl_conf_info_msg_t * config_ptr) xfree(config_ptr->job_credential_public_certificate); xfree(config_ptr->mail_prog); xfree(config_ptr->mpi_default); + xfree(config_ptr->node_acct_storage_loc); + xfree(config_ptr->node_acct_storage_type); + xfree(config_ptr->node_acct_storage_user); + xfree(config_ptr->node_acct_storage_host); + xfree(config_ptr->node_acct_storage_pass); xfree(config_ptr->node_prefix); xfree(config_ptr->plugindir); xfree(config_ptr->plugstack); diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am 
index 991be7545fcecb2100a8b54011295a6a43205cae..dd110dfc7bc4fba6a4ddf4d10b5601e890891c48 100644 --- a/src/plugins/Makefile.am +++ b/src/plugins/Makefile.am @@ -1 +1 @@ -SUBDIRS = auth checkpoint crypto jobacct_gather jobacct_storage jobcomp mpi proctrack sched select switch task +SUBDIRS = auth checkpoint crypto jobacct_gather jobacct_storage jobcomp mpi nodeacct_storage proctrack sched select switch task diff --git a/src/plugins/Makefile.in b/src/plugins/Makefile.in index 2224a33012cfb26b59f9c8ee34de30dec5a9491b..78a374be1134130ca045b14c32699a06a7449591 100644 --- a/src/plugins/Makefile.in +++ b/src/plugins/Makefile.in @@ -241,7 +241,7 @@ target_os = @target_os@ target_vendor = @target_vendor@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ -SUBDIRS = auth checkpoint crypto jobacct_gather jobacct_storage jobcomp mpi proctrack sched select switch task +SUBDIRS = auth checkpoint crypto jobacct_gather jobacct_storage jobcomp mpi nodeacct_storage proctrack sched select switch task all: all-recursive .SUFFIXES: diff --git a/src/plugins/jobacct_storage/mysql/jobacct_storage_mysql.c b/src/plugins/jobacct_storage/mysql/jobacct_storage_mysql.c index 6548a1fddbb12997f1277ad07346217a458ad157..eaadd93fc1dcd78cc6664e3a92de258f4c44ba6d 100644 --- a/src/plugins/jobacct_storage/mysql/jobacct_storage_mysql.c +++ b/src/plugins/jobacct_storage/mysql/jobacct_storage_mysql.c @@ -769,9 +769,9 @@ extern int jobacct_storage_p_suspend(struct job_record *job_ptr) * note List needs to be freed when called */ extern void jobacct_storage_p_get_jobs(List job_list, - List selected_steps, - List selected_parts, - void *params) + List selected_steps, + List selected_parts, + void *params) { #ifdef HAVE_MYSQL if(!jobacct_mysql_db || mysql_ping(jobacct_mysql_db) != 0) { diff --git a/src/plugins/nodeacct_storage/Makefile.am b/src/plugins/nodeacct_storage/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..317de56f5b11384ec181beed86da645bd1b2ae8a --- /dev/null +++ 
b/src/plugins/nodeacct_storage/Makefile.am @@ -0,0 +1,3 @@ +# Makefile for storage plugins + +SUBDIRS = gold mysql none pgsql diff --git a/src/plugins/nodeacct_storage/gold/Makefile.am b/src/plugins/nodeacct_storage/gold/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..92fe6aa8843e50f1251a74c3f41d16db5db619eb --- /dev/null +++ b/src/plugins/nodeacct_storage/gold/Makefile.am @@ -0,0 +1,13 @@ +# Makefile for nodeacct_storage/gold plugin + +AUTOMAKE_OPTIONS = foreign + +PLUGIN_FLAGS = -module -avoid-version --export-dynamic + +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common + +pkglib_LTLIBRARIES = nodeacct_storage_gold.la + +# Null job completion logging plugin. +nodeacct_storage_gold_la_SOURCES = nodeacct_storage_gold.c +nodeacct_storage_gold_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/nodeacct_storage/gold/nodeacct_storage_none.c b/src/plugins/nodeacct_storage/gold/nodeacct_storage_none.c new file mode 100644 index 0000000000000000000000000000000000000000..c73cdaabc19e4ab797bfa8ff407e2fda1d0a24b4 --- /dev/null +++ b/src/plugins/nodeacct_storage/gold/nodeacct_storage_none.c @@ -0,0 +1,383 @@ +/*****************************************************************************\ + * nodeacct_storage_none.c - NO-OP slurm job completion logging plugin. + ***************************************************************************** + * Copyright (C) 2002-2008 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Danny Auble <da@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_STDINT_H +# include <stdint.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#include <stdio.h> +#include <slurm/slurm_errno.h> + +#include "src/slurmctld/slurmctld.h" +#include "src/common/slurm_nodeacct_storage.h" + +/* + * These variables are required by the generic plugin interface. If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - a string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. 
+ * + * plugin_type - a string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "jobacct" for SLURM job completion logging) and <method> + * is a description of how this plugin satisfies that application. SLURM will + * only load job completion logging plugins if the plugin_type string has a + * prefix of "jobacct/". + * + * plugin_version - an unsigned 32-bit integer giving the version number + * of the plugin. If major and minor revisions are desired, the major + * version number may be multiplied by a suitable magnitude constant such + * as 100 or 1000. Various SLURM versions will likely require a certain + * minimum versions for their plugins as the job accounting API + * matures. + */ +const char plugin_name[] = "Node accounting storage NOT_INVOKED plugin"; +const char plugin_type[] = "nodeacct_storage/none"; +const uint32_t plugin_version = 100; + +#define DEFAULT_NODEACCT_LOC "localhost" + +/* + * init() is called when the plugin is loaded, before any other functions + * are called. Put global initialization here. 
+ */
+extern int init ( void )
+{
+	verbose("%s loaded", plugin_name);
+	return SLURM_SUCCESS;
+}
+
+/*
+ * fini() holds no plugin state to release and always reports success.
+ */
+extern int fini ( void )
+{
+	return SLURM_SUCCESS;
+}
+
+/*
+ * Record a node DOWN event in Gold.
+ *
+ * Any event for this node that is still open (EndTime of 0) is closed at
+ * event_time - 1, then a new event starting at event_time is created
+ * carrying the node's CPU count and the reason.
+ *
+ * IN node_ptr - node that went down
+ * IN event_time - time of the event
+ * IN reason - reason to record; node_ptr->reason is used when NULL
+ * RET SLURM_SUCCESS only if Gold accepted both requests, else SLURM_ERROR
+ *
+ * NOTE(review): cluster_name, the gold_* types and the gold request
+ * helpers are not declared in the visible part of this file.
+ */
+extern int nodeacct_storage_p_node_down(struct node_record *node_ptr,
+					time_t event_time,
+					char *reason)
+{
+	uint16_t cpus;
+	int rc = SLURM_ERROR;
+	gold_request_t *gold_request = NULL;
+	gold_response_t *gold_response = NULL;
+	char tmp_buff[50];
+
+	if (slurmctld_conf.fast_schedule)
+		cpus = node_ptr->config_ptr->cpus;
+	else
+		cpus = node_ptr->cpus;
+
+#if _DEBUG
+	slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff));
+	info("Node_acct_down: %s at %s with %u cpus due to %s",
+	     node_ptr->name, tmp_buff, cpus, node_ptr->reason);
+#endif
+	/* If the node was already down end that record since the
+	 * reason will most likely be different
+	 */
+
+	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
+					   GOLD_ACTION_MODIFY);
+	if(!gold_request)
+		return rc;
+
+	gold_request_add_condition(gold_request, "Machine", cluster_name,
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "EndTime", "0",
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "Name", node_ptr->name,
+				   GOLD_OPERATOR_NONE);
+
+	snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1));
+	gold_request_add_assignment(gold_request, "EndTime", tmp_buff);
+
+	gold_response = get_gold_response(gold_request);
+	destroy_gold_request(gold_request);
+
+	if(!gold_response) {
+		error("nodeacct_storage_p_node_down: no response received");
+		return rc;
+	}
+
+	if(gold_response->rc) {
+		error("gold_response has non-zero rc(%d): %s",
+		      gold_response->rc,
+		      gold_response->message);
+		destroy_gold_response(gold_response);
+		return rc;
+	}
+	destroy_gold_response(gold_response);
+
+	/* now add the new one */
+	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
+					   GOLD_ACTION_CREATE);
+	if(!gold_request)
+		return rc;
+
+	gold_request_add_assignment(gold_request, "Machine", cluster_name);
+	snprintf(tmp_buff, sizeof(tmp_buff), "%d", (int)event_time);
+	gold_request_add_assignment(gold_request, "StartTime", tmp_buff);
+	gold_request_add_assignment(gold_request, "Name", node_ptr->name);
+	/* Record the count selected above, which honours fast_schedule,
+	 * rather than always the node's own value. */
+	snprintf(tmp_buff, sizeof(tmp_buff), "%u", cpus);
+	gold_request_add_assignment(gold_request, "CPUCount", tmp_buff);
+	if(reason)
+		gold_request_add_assignment(gold_request, "Reason", reason);
+	else
+		gold_request_add_assignment(gold_request, "Reason",
+					    node_ptr->reason);
+
+	gold_response = get_gold_response(gold_request);
+	destroy_gold_request(gold_request);
+
+	if(!gold_response) {
+		error("nodeacct_storage_p_node_down: no response received");
+		return rc;
+	}
+
+	if(!gold_response->rc)
+		rc = SLURM_SUCCESS;
+	else {
+		error("gold_response has non-zero rc(%d): %s",
+		      gold_response->rc,
+		      gold_response->message);
+	}
+	destroy_gold_response(gold_response);
+
+	return rc;
+}
+
+/*
+ * Record a node UP event in Gold by closing the node's open event
+ * (EndTime of 0) at event_time - 1.
+ *
+ * IN node_ptr - node that came back
+ * IN event_time - time of the event
+ * RET SLURM_SUCCESS if Gold accepted the update, else SLURM_ERROR
+ */
+extern int nodeacct_storage_p_node_up(struct node_record *node_ptr,
+				      time_t event_time)
+{
+	int rc = SLURM_ERROR;
+	gold_request_t *gold_request = NULL;
+	gold_response_t *gold_response = NULL;
+	char tmp_buff[50];
+
+#if _DEBUG
+	slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff));
+	info("Node_acct_up: %s at %s", node_ptr->name, tmp_buff);
+#endif
+
+	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
+					   GOLD_ACTION_MODIFY);
+	if(!gold_request)
+		return rc;
+
+	gold_request_add_condition(gold_request, "Machine", cluster_name,
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "EndTime", "0",
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "Name", node_ptr->name,
+				   GOLD_OPERATOR_NONE);
+
+	snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1));
+	gold_request_add_assignment(gold_request, "EndTime", tmp_buff);
+
+	gold_response = get_gold_response(gold_request);
+	destroy_gold_request(gold_request);
+
+	if(!gold_response) {
+		error("nodeacct_storage_p_node_up: no response received");
+		return rc;
+	}
+
+	if(gold_response->rc) {
+		error("gold_response has non-zero rc(%d): %s",
+		      gold_response->rc,
+		      gold_response->message);
+	} else {
+		/* Gold accepted the update; without this the function
+		 * reported SLURM_ERROR on every call. */
+		rc = SLURM_SUCCESS;
+	}
+	destroy_gold_response(gold_response);
+
+	return rc;
+}
+
+/*
+ * Record the cluster's total processor count in Gold.
+ *
+ * Nothing is sent when procs equals the value seen on the previous call
+ * or the CPUCount of the open cluster-wide event stored in Gold.
+ * Otherwise the open event is closed at event_time - 1 and a new one
+ * starting at event_time is created.
+ *
+ * IN procs - total processor count of the cluster
+ * IN event_time - time of the event
+ * RET SLURM_SUCCESS if nothing had to be written or Gold accepted the
+ *	new event, else SLURM_ERROR
+ */
+extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time)
+{
+	static uint32_t last_procs = -1;
+	gold_request_t *gold_request = NULL;
+	gold_response_t *gold_response = NULL;
+	char tmp_buff[50];
+	int rc = SLURM_ERROR;
+
+	if (procs == last_procs) {
+		debug3("we have the same procs as before no need to "
+		       "query the database.");
+		return SLURM_SUCCESS;
+	}
+	/* NOTE(review): the cached value is updated before any request has
+	 * succeeded, so after a failed write a later call with the same
+	 * count returns success without retrying - confirm this is wanted.
+	 */
+	last_procs = procs;
+
+	/* Record the processor count */
+#if _DEBUG
+	slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff));
+	info("Node_acct_procs: %s has %u total CPUs at %s",
+	     cluster_name, procs, tmp_buff);
+#endif
+
+	/* get the last known one */
+	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
+					   GOLD_ACTION_QUERY);
+	if(!gold_request)
+		return rc;
+	gold_request_add_condition(gold_request, "Machine", cluster_name,
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "EndTime", "0",
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "Name", "NULL",
+				   GOLD_OPERATOR_NONE);
+
+	gold_request_add_selection(gold_request, "CPUCount");
+
+	gold_response = get_gold_response(gold_request);
+	destroy_gold_request(gold_request);
+
+	if(!gold_response) {
+		error("nodeacct_storage_p_cluster_procs: "
+		      "no response received");
+		return rc;
+	}
+
+	if(gold_response->entry_cnt > 0) {
+		gold_response_entry_t *resp_entry =
+			list_pop(gold_response->entries);
+		gold_name_value_t *name_val = list_pop(resp_entry->name_val);
+
+		if(procs == (uint32_t) atoi(name_val->value)) {
+			debug("System hasn't changed since last entry");
+			destroy_gold_name_value(name_val);
+			destroy_gold_response_entry(resp_entry);
+			destroy_gold_response(gold_response);
+			return SLURM_SUCCESS;
+		} else {
+			/* procs is unsigned, so print it with %u */
+			debug("System has changed from %s cpus to %u",
+			      name_val->value, procs);
+		}
+
+		destroy_gold_name_value(name_val);
+		destroy_gold_response_entry(resp_entry);
+	} else {
+		debug("We don't have an entry for this machine "
+		      "most likely a first time running.");
+	}
+
+	destroy_gold_response(gold_response);
+
+	/* close the open cluster-wide event, if there is one */
+	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
+					   GOLD_ACTION_MODIFY);
+	if(!gold_request)
+		return rc;
+
+	gold_request_add_condition(gold_request, "Machine", cluster_name,
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "EndTime", "0",
+				   GOLD_OPERATOR_NONE);
+	gold_request_add_condition(gold_request, "Name", "NULL",
+				   GOLD_OPERATOR_NONE);
+
+	snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1));
+	gold_request_add_assignment(gold_request, "EndTime", tmp_buff);
+
+	gold_response = get_gold_response(gold_request);
+	destroy_gold_request(gold_request);
+
+	if(!gold_response) {
+		error("nodeacct_storage_p_cluster_procs: "
+		      "no response received");
+		return rc;
+	}
+
+	if(gold_response->rc) {
+		error("gold_response has non-zero rc(%d): %s",
+		      gold_response->rc,
+		      gold_response->message);
+		destroy_gold_response(gold_response);
+		return rc;
+	}
+	destroy_gold_response(gold_response);
+
+	/* now add the new one */
+	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
+					   GOLD_ACTION_CREATE);
+	if(!gold_request)
+		return rc;
+
+	gold_request_add_assignment(gold_request, "Machine", cluster_name);
+	snprintf(tmp_buff, sizeof(tmp_buff), "%d", (int)event_time);
+	gold_request_add_assignment(gold_request, "StartTime", tmp_buff);
+	snprintf(tmp_buff, sizeof(tmp_buff), "%u", procs);
+	gold_request_add_assignment(gold_request, "CPUCount", tmp_buff);
+
+	gold_response = get_gold_response(gold_request);
+	destroy_gold_request(gold_request);
+
+	if(!gold_response) {
+		error("nodeacct_storage_p_cluster_procs: "
+		      "no response received");
+		return rc;
+	}
+
+	if(!gold_response->rc)
+		rc = SLURM_SUCCESS;
+	else {
+		error("gold_response has non-zero rc(%d): %s",
+		      gold_response->rc,
+		      gold_response->message);
+	}
+	destroy_gold_response(gold_response);
+
+	return rc;
+}
+
diff --git a/src/plugins/nodeacct_storage/mysql/Makefile.am b/src/plugins/nodeacct_storage/mysql/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..4cac50f3fcf0e4a22974226db9cd4ce862206a40
--- /dev/null
+++ b/src/plugins/nodeacct_storage/mysql/Makefile.am
@@ -0,0 +1,13 @@
+# Makefile for nodeacct_storage/mysql plugin
+
+AUTOMAKE_OPTIONS = foreign
+
+PLUGIN_FLAGS = -module -avoid-version --export-dynamic
+
+INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
+
+pkglib_LTLIBRARIES = nodeacct_storage_mysql.la
+
+# Null job completion logging plugin.
+nodeacct_storage_mysql_la_SOURCES = nodeacct_storage_mysql.c
+nodeacct_storage_mysql_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
diff --git a/src/plugins/nodeacct_storage/mysql/nodeacct_storage_none.c b/src/plugins/nodeacct_storage/mysql/nodeacct_storage_none.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0a948becaf1f8a46e9ab05c276133fdac13081f
--- /dev/null
+++ b/src/plugins/nodeacct_storage/mysql/nodeacct_storage_none.c
@@ -0,0 +1,118 @@
+/*****************************************************************************\
+ * nodeacct_storage_none.c - NO-OP slurm job completion logging plugin.
+ *****************************************************************************
+ * Copyright (C) 2002-2008 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Danny Auble <da@llnl.gov>
+ * UCRL-CODE-226842.
+ *
+ * This file is part of SLURM, a resource management program.
+ * For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ * SLURM is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_STDINT_H +# include <stdint.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#include <stdio.h> +#include <slurm/slurm_errno.h> + +#include "src/slurmctld/slurmctld.h" +#include "src/common/slurm_nodeacct_storage.h" + +/* + * These variables are required by the generic plugin interface. If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - a string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. 
+ * + * plugin_type - a string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "nodeacct_storage" for SLURM node accounting) and <method> + * is a description of how this plugin satisfies that application. SLURM will + * only load node accounting storage plugins if the plugin_type string has a + * prefix of "nodeacct_storage/". + * + * plugin_version - an unsigned 32-bit integer giving the version number + * of the plugin. If major and minor revisions are desired, the major + * version number may be multiplied by a suitable magnitude constant such + * as 100 or 1000. Various SLURM versions will likely require a certain + * minimum version for their plugins as the node accounting API + * matures. + */ +const char plugin_name[] = "Node accounting storage NOT_INVOKED plugin"; +const char plugin_type[] = "nodeacct_storage/none"; +const uint32_t plugin_version = 100; +/* NOTE(review): type says "none" but this copy is in mysql/ - confirm. */ +/* + * init() is called when the plugin is loaded, before any other functions + * are called. Put global initialization here.
+ */ +extern int init ( void ) +{ + verbose("%s loaded", plugin_name); + return SLURM_SUCCESS; +} + +extern int fini ( void ) +{ + return SLURM_SUCCESS; +} + +/* The storage hooks below record nothing and always return SLURM_SUCCESS. */ +extern int nodeacct_storage_p_node_down(struct node_record *node_ptr, + time_t event_time, char *reason) +{ + return SLURM_SUCCESS; +} +extern int nodeacct_storage_p_node_up(struct node_record *node_ptr, + time_t event_time) +{ + return SLURM_SUCCESS; +} +extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time) +{ + return SLURM_SUCCESS; +} + diff --git a/src/plugins/nodeacct_storage/none/Makefile.am b/src/plugins/nodeacct_storage/none/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..427a53fa695b72174912220d62f55d6cba0b529a --- /dev/null +++ b/src/plugins/nodeacct_storage/none/Makefile.am @@ -0,0 +1,13 @@ +# Makefile for nodeacct_storage/none plugin + +AUTOMAKE_OPTIONS = foreign + +PLUGIN_FLAGS = -module -avoid-version --export-dynamic + +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common + +pkglib_LTLIBRARIES = nodeacct_storage_none.la + +# Null node accounting storage plugin. +nodeacct_storage_none_la_SOURCES = nodeacct_storage_none.c +nodeacct_storage_none_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/nodeacct_storage/none/nodeacct_storage_none.c b/src/plugins/nodeacct_storage/none/nodeacct_storage_none.c new file mode 100644 index 0000000000000000000000000000000000000000..c0a948becaf1f8a46e9ab05c276133fdac13081f --- /dev/null +++ b/src/plugins/nodeacct_storage/none/nodeacct_storage_none.c @@ -0,0 +1,118 @@ +/*****************************************************************************\ + * nodeacct_storage_none.c - NO-OP slurm node accounting storage plugin. + ***************************************************************************** + * Copyright (C) 2002-2008 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Danny Auble <da@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_STDINT_H +# include <stdint.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#include <stdio.h> +#include <slurm/slurm_errno.h> + +#include "src/slurmctld/slurmctld.h" +#include "src/common/slurm_nodeacct_storage.h" + +/* + * These variables are required by the generic plugin interface. If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - a string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. + * + * plugin_type - a string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "nodeacct_storage" for SLURM node accounting) and <method> + * is a description of how this plugin satisfies that application. SLURM will + * only load node accounting storage plugins if the plugin_type string has a + * prefix of "nodeacct_storage/". + * + * plugin_version - an unsigned 32-bit integer giving the version number + * of the plugin. If major and minor revisions are desired, the major + * version number may be multiplied by a suitable magnitude constant such + * as 100 or 1000. Various SLURM versions will likely require a certain + * minimum version for their plugins as the node accounting API + * matures.
+ */ +const char plugin_name[] = "Node accounting storage NOT_INVOKED plugin"; +const char plugin_type[] = "nodeacct_storage/none"; +const uint32_t plugin_version = 100; + +/* + * init() is called when the plugin is loaded, before any other functions + * are called. Put global initialization here. + */ +extern int init ( void ) +{ + verbose("%s loaded", plugin_name); + return SLURM_SUCCESS; +} + +extern int fini ( void ) +{ + return SLURM_SUCCESS; +} + +/* The storage hooks below record nothing and always return SLURM_SUCCESS. */ +extern int nodeacct_storage_p_node_down(struct node_record *node_ptr, + time_t event_time, char *reason) +{ + return SLURM_SUCCESS; +} +extern int nodeacct_storage_p_node_up(struct node_record *node_ptr, + time_t event_time) +{ + return SLURM_SUCCESS; +} +extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time) +{ + return SLURM_SUCCESS; +} + diff --git a/src/plugins/nodeacct_storage/pgsql/Makefile.am b/src/plugins/nodeacct_storage/pgsql/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..5bd02524027981b0366f6e98dff651a06c42a607 --- /dev/null +++ b/src/plugins/nodeacct_storage/pgsql/Makefile.am @@ -0,0 +1,13 @@ +# Makefile for nodeacct_storage/pgsql plugin + +AUTOMAKE_OPTIONS = foreign + +PLUGIN_FLAGS = -module -avoid-version --export-dynamic + +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common + +pkglib_LTLIBRARIES = nodeacct_storage_pgsql.la +# PostgreSQL node accounting storage plugin. +# NOTE(review): this patch adds nodeacct_storage_none.c here - confirm source.
+nodeacct_storage_pgsql_la_SOURCES = nodeacct_storage_pgsql.c +nodeacct_storage_pgsql_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/nodeacct_storage/pgsql/nodeacct_storage_none.c b/src/plugins/nodeacct_storage/pgsql/nodeacct_storage_none.c new file mode 100644 index 0000000000000000000000000000000000000000..c0a948becaf1f8a46e9ab05c276133fdac13081f --- /dev/null +++ b/src/plugins/nodeacct_storage/pgsql/nodeacct_storage_none.c @@ -0,0 +1,118 @@ +/*****************************************************************************\ + * nodeacct_storage_none.c - NO-OP slurm node accounting storage plugin. + ***************************************************************************** + * Copyright (C) 2002-2008 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Danny Auble <da@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version.
If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_STDINT_H +# include <stdint.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#include <stdio.h> +#include <slurm/slurm_errno.h> + +#include "src/slurmctld/slurmctld.h" +#include "src/common/slurm_nodeacct_storage.h" + +/* + * These variables are required by the generic plugin interface. If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - a string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. + * + * plugin_type - a string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "nodeacct_storage" for SLURM node accounting) and <method> + * is a description of how this plugin satisfies that application. SLURM will + * only load node accounting storage plugins if the plugin_type string has a + * prefix of "nodeacct_storage/".
+ * + * plugin_version - an unsigned 32-bit integer giving the version number + * of the plugin. If major and minor revisions are desired, the major + * version number may be multiplied by a suitable magnitude constant such + * as 100 or 1000. Various SLURM versions will likely require a certain + * minimum version for their plugins as the node accounting API + * matures. + */ +const char plugin_name[] = "Node accounting storage NOT_INVOKED plugin"; +const char plugin_type[] = "nodeacct_storage/none"; +const uint32_t plugin_version = 100; +/* NOTE(review): type says "none" but this copy is in pgsql/ - confirm. */ +/* + * init() is called when the plugin is loaded, before any other functions + * are called. Put global initialization here. + */ +extern int init ( void ) +{ + verbose("%s loaded", plugin_name); + return SLURM_SUCCESS; +} + +extern int fini ( void ) +{ + return SLURM_SUCCESS; +} + +/* The storage hooks below record nothing and always return SLURM_SUCCESS. */ +extern int nodeacct_storage_p_node_down(struct node_record *node_ptr, + time_t event_time, char *reason) +{ + return SLURM_SUCCESS; +} +extern int nodeacct_storage_p_node_up(struct node_record *node_ptr, + time_t event_time) +{ + return SLURM_SUCCESS; +} +extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time) +{ + return SLURM_SUCCESS; +} + diff --git a/src/slurmctld/Makefile.in b/src/slurmctld/Makefile.in index 363c513e1dfeaf522f4319c498d69bbfea2ad350..3433d59ddd12e6e2b6e394c67fd1236db84aa7d4 100644 --- a/src/slurmctld/Makefile.in +++ b/src/slurmctld/Makefile.in @@ -278,7 +278,7 @@ slurmctld_SOURCES = \ controller.c \ job_mgr.c \ job_scheduler.c \ - job_scheduler.h \ + job_scheduler.h \ locks.c \ locks.h \ node_mgr.c \ diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 93ab07c7c1dfcd01b2d4848b9568e3cd28f59e2b..d688adce23fd679637f3c79c71666db20df3e538 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -70,6 +70,7 @@ #include "src/common/read_config.h" #include "src/common/slurm_jobacct_gather.h" #include "src/common/slurm_jobacct_storage.h" +#include
"src/common/slurm_nodeacct_storage.h" #include "src/common/slurm_auth.h" #include "src/common/slurm_jobcomp.h" #include "src/common/slurm_protocol_api.h" @@ -102,6 +103,12 @@ * check-in before we ping them */ #define SHUTDOWN_WAIT 2 /* Time to wait for backup server shutdown */ +#if (0) +/* If defined and FastSchedule=0 in slurm.conf, then report the CPU count that a + * node registers with rather than the CPU count defined for the node in slurm.conf */ +#define SLURM_NODE_ACCT_REGISTER 1 +#endif + /**************************************************************************\ * To test for memory leaks, set MEMORY_LEAK_DEBUG to 1 using * "configure --enable-memory-leak-debug" then execute @@ -154,6 +161,8 @@ static int controller_sigarray[] = { static void _default_sigaction(int sig); inline static void _free_server_thread(void); +static int _gold_cluster_ready(); +static int _gold_mark_all_nodes_down(char *reason, time_t event_time); static void _init_config(void); static void _init_pidfile(void); static void _kill_old_slurmctld(void); @@ -308,13 +317,16 @@ int main(int argc, char *argv[]) (void) _shutdown_backup_controller(SHUTDOWN_WAIT); /* Now recover the remaining state information */ if (switch_restore(slurmctld_conf.state_save_location, - recover ? true : false)) + recover ? 
true : false)) fatal(" failed to initialize switch plugin" ); if ((error_code = read_slurm_conf(recover))) { fatal("read_slurm_conf reading %s: %s", slurmctld_conf.slurm_conf, slurm_strerror(error_code)); } + if (recover == 0) + _gold_mark_all_nodes_down("cold-start", + time(NULL)); } else { error("this host (%s) not valid controller (%s or %s)", node_name, slurmctld_conf.control_machine, @@ -322,6 +334,7 @@ int main(int argc, char *argv[]) exit(0); } info("Running as primary controller"); + _gold_cluster_ready(); if (slurm_sched_init() != SLURM_SUCCESS) fatal("failed to initialize scheduling plugin"); @@ -783,6 +796,79 @@ static void _free_server_thread(void) pthread_cond_broadcast(&server_thread_cond); } +/* + * _gold_cluster_ready - add up the CPU count of every node record that has + * a name (NULL or empty names are skipped) and report the total to the + * node accounting storage plugin. config_ptr->cpus is counted unless + * SLURM_NODE_ACCT_REGISTER is defined and fast_schedule is zero, in + * which case node_ptr->cpus is counted instead. + * RET the return code of nodeacct_storage_g_cluster_procs() + */ +static int _gold_cluster_ready(void) +{ + uint32_t procs = 0; + struct node_record *node_ptr; + int i; + int rc = SLURM_ERROR; + time_t event_time = time(NULL); + + node_ptr = node_record_table_ptr; + for (i = 0; i < node_record_count; i++, node_ptr++) { + if ((node_ptr->name == NULL) || (node_ptr->name[0] == '\0')) + continue; +#ifdef SLURM_NODE_ACCT_REGISTER + if (slurmctld_conf.fast_schedule) + procs += node_ptr->config_ptr->cpus; + else + procs += node_ptr->cpus; +#else + procs += node_ptr->config_ptr->cpus; +#endif + } + + rc = nodeacct_storage_g_cluster_procs(procs, event_time); + + return rc; +} + +/* + * _gold_mark_all_nodes_down - log a node DOWN event for every node record + * that has a name (NULL or empty names are skipped). Nothing is logged + * if the node_state file in state_save_location cannot be stat()ed. + * IN reason - passed through to nodeacct_storage_g_node_down() + * IN event_time - passed through to nodeacct_storage_g_node_down() + * RET SLURM_ERROR if the stat() fails, no node was logged or a logging call + * returned SLURM_ERROR; otherwise the result of the last logging call + */ +static int _gold_mark_all_nodes_down(char *reason, time_t event_time) +{ + char *state_file; + struct stat stat_buf; + struct node_record *node_ptr; + int i; + int rc = SLURM_ERROR; + + state_file = xstrdup (slurmctld_conf.state_save_location); + xstrcat (state_file, "/node_state"); + if (stat(state_file, &stat_buf)) { + error("_gold_mark_all_nodes_down: could not stat(%s) to record " + "node down time", state_file); + xfree(state_file); + return rc; + } + xfree(state_file); + + node_ptr = node_record_table_ptr; + for (i = 0; i < node_record_count; i++, node_ptr++) { + if ((node_ptr->name == NULL) || (node_ptr->name[0] == '\0')) + continue; + if((rc = nodeacct_storage_g_node_down(node_ptr, event_time, + reason)) + == SLURM_ERROR)
+ break; + } + return rc; +} /* * _slurmctld_background - process slurmctld background activities * purge defunct job records, save state, schedule jobs, and @@ -799,6 +885,7 @@ static void *_slurmctld_background(void *no_data) static time_t last_timelimit_time; static time_t last_assert_primary_time; static time_t last_trigger; + static time_t last_node_acct; time_t now; int ping_interval; DEF_TIMERS; @@ -816,6 +903,9 @@ static void *_slurmctld_background(void *no_data) * (Might kill jobs on nodes set DOWN) */ slurmctld_lock_t node_write_lock = { READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; + /* Locks: Read node */ + slurmctld_lock_t node_read_lock = { + NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK }; /* Locks: Write partition */ slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK }; @@ -834,6 +924,7 @@ static void *_slurmctld_background(void *no_data) ping_interval = 60 * 60 * 24 * 356; /* one year */ last_ping_node_time = now + (time_t)MIN_CHECKIN_TIME - ping_interval; last_ping_srun_time = now; + last_node_acct = now; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); debug3("_slurmctld_background pid = %u", getpid()); @@ -934,6 +1025,14 @@ static void *_slurmctld_background(void *no_data) save_all_state(); } + if (difftime(now, last_node_acct) >= PERIODIC_NODE_ACCT) { + /* Report current node state to account for added + * or reconfigured nodes */ + last_node_acct = now; + lock_slurmctld(node_read_lock); + _gold_cluster_ready(); + unlock_slurmctld(node_read_lock); + } /* Reassert this machine as the primary controller.
* A network or security problem could result in * the backup controller assuming control even diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 890c0190f7744b7c2e389bbf9d4297ab47e91a28..bc7ba49bea4b86ab9a0d17dd07a1b11fe2954d35 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1080,13 +1080,13 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) difftime(now, job_ptr->suspend_time); } else job_ptr->end_time = now; - deallocate_nodes(job_ptr, false, suspended); - + /* We want this job to look like it was cancelled in the * accounting logs. Set a new submit time so the restarted * job looks like a new job. */ save_state = job_ptr->job_state; job_ptr->job_state = JOB_CANCELLED; + deallocate_nodes(job_ptr, false, suspended); job_completion_logger(job_ptr); job_ptr->job_state = save_state; job_ptr->details->submit_time = now; @@ -1105,8 +1105,8 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) difftime(now, job_ptr->suspend_time); } else job_ptr->end_time = time(NULL); - job_completion_logger(job_ptr); deallocate_nodes(job_ptr, false, suspended); + job_completion_logger(job_ptr); } } @@ -1378,7 +1378,6 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; job_ptr->start_time = job_ptr->end_time = now; - jobacct_storage_g_job_start(job_ptr); job_completion_logger(job_ptr); } return error_code; @@ -1411,7 +1410,6 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; job_ptr->start_time = job_ptr->end_time = now; - jobacct_storage_g_job_start(job_ptr); job_completion_logger(job_ptr); if (!independent) return ESLURM_DEPENDENCY; @@ -1426,7 +1424,6 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, no_alloc = test_only || too_fragmented || (!top_prio) || (!independent); error_code = 
select_nodes(job_ptr, no_alloc, NULL); - jobacct_storage_g_job_start(job_ptr); if (!test_only) { last_job_update = now; slurm_sched_schedule(); /* work for external scheduler */ @@ -2016,6 +2013,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, job_ptr->state_reason = fail_reason; } + cleanup: FREE_NULL_BITMAP(req_bitmap); FREE_NULL_BITMAP(exc_bitmap); @@ -4516,8 +4514,6 @@ extern void job_completion_logger(struct job_record *job_ptr) mail_job_info(job_ptr, MAIL_JOB_FAIL); } - /* write out to logs */ - jobacct_storage_g_job_complete(job_ptr); g_slurm_jobcomp_write(job_ptr); } @@ -4938,14 +4934,14 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd conn_fd) job_ptr->end_time = job_ptr->suspend_time; else job_ptr->end_time = now; - deallocate_nodes(job_ptr, false, suspended); - xfree(job_ptr->details->req_node_layout); /* We want this job to look like it was cancelled in the * accounting logs. Set a new submit time so the restarted * job looks like a new job. 
*/ save_state = job_ptr->job_state; job_ptr->job_state = JOB_CANCELLED; + deallocate_nodes(job_ptr, false, suspended); + xfree(job_ptr->details->req_node_layout); job_completion_logger(job_ptr); job_ptr->job_state = save_state; job_ptr->details->submit_time = now; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index c7a94e4e2a8a75ec0a49727946217fe73268278c..9c71f9430b1d7a7956e97aac5d3ba8f24cd4b152 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -62,6 +62,7 @@ #include "src/common/xstring.h" #include "src/common/node_select.h" #include "src/common/read_config.h" +#include "src/common/slurm_nodeacct_storage.h" #include "src/slurmctld/agent.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/ping_nodes.h" @@ -96,7 +97,8 @@ static struct node_record * _find_alias_node_record (char *name); static int _hash_index (char *name); static void _list_delete_config (void *config_entry); static int _list_find_config (void *config_entry, void *key); -static void _make_node_down(struct node_record *node_ptr); +static void _make_node_down(struct node_record *node_ptr, + time_t event_time); static void _node_did_resp(struct node_record *node_ptr); static bool _node_is_hidden(struct node_record *node_ptr); static void _node_not_resp (struct node_record *node_ptr, time_t msg_time); @@ -989,11 +991,11 @@ void set_slurmd_addr (void) */ int update_node ( update_node_msg_t * update_node_msg ) { - int error_code = 0, base_state = 0, node_inx; + int error_code = 0, node_inx; struct node_record *node_ptr = NULL; char *this_node_name = NULL; hostlist_t host_list; - uint16_t node_flags = 0, state_val; + uint16_t base_state = 0, node_flags = 0, state_val; time_t now = time(NULL); if (update_node_msg -> node_names == NULL ) { @@ -1023,6 +1025,14 @@ int update_node ( update_node_msg_t * update_node_msg ) break; } + if ((update_node_msg -> reason) && + (update_node_msg -> reason[0])) { + xfree(node_ptr->reason); + node_ptr->reason = 
xstrdup(update_node_msg->reason); + info ("update_node: node %s reason set to: %s", + this_node_name, node_ptr->reason); + } + if (state_val != (uint16_t) NO_VAL) { base_state = node_ptr->node_state; if (!_valid_node_state_change(base_state, state_val)) { @@ -1037,6 +1047,12 @@ int update_node ( update_node_msg_t * update_node_msg ) } if (state_val != (uint16_t) NO_VAL) { if (state_val == NODE_RESUME) { + base_state &= NODE_STATE_BASE; + if ((base_state == NODE_STATE_IDLE) && + ((node_ptr->node_state & NODE_STATE_DRAIN) || + (node_ptr->node_state & NODE_STATE_FAIL))) { + nodeacct_storage_g_node_up(node_ptr, now); + } node_ptr->node_state &= (~NODE_STATE_DRAIN); node_ptr->node_state &= (~NODE_STATE_FAIL); base_state &= NODE_STATE_BASE; @@ -1048,13 +1064,22 @@ int update_node ( update_node_msg_t * update_node_msg ) if (state_val == NODE_STATE_DOWN) { /* We must set node DOWN before killing * its jobs */ - _make_node_down(node_ptr); + _make_node_down(node_ptr, now); kill_running_job_by_node_name (this_node_name, false); } else if (state_val == NODE_STATE_IDLE) { /* assume they want to clear DRAIN and * FAIL flags too */ + base_state &= NODE_STATE_BASE; + if (base_state == NODE_STATE_DOWN) { + trigger_node_up(node_ptr); + nodeacct_storage_g_node_up(node_ptr, now); + } else if ((base_state == NODE_STATE_IDLE) && + ((node_ptr->node_state & NODE_STATE_DRAIN) || + (node_ptr->node_state & NODE_STATE_FAIL))) { + nodeacct_storage_g_node_up(node_ptr, now); + } node_ptr->node_state &= (~NODE_STATE_DRAIN); node_ptr->node_state &= (~NODE_STATE_FAIL); bit_set (avail_node_bitmap, node_inx); @@ -1074,12 +1099,20 @@ int update_node ( update_node_msg_t * update_node_msg ) bit_clear (avail_node_bitmap, node_inx); state_val = node_ptr->node_state | NODE_STATE_DRAIN; + if ((node_ptr->run_job_cnt == 0) && + (node_ptr->comp_job_cnt == 0)) + nodeacct_storage_g_node_down(node_ptr, now, + NULL); } else if (state_val == NODE_STATE_FAIL) { bit_clear (avail_node_bitmap, node_inx); state_val = 
node_ptr->node_state | NODE_STATE_FAIL; trigger_node_failing(node_ptr); + if ((node_ptr->run_job_cnt == 0) && + (node_ptr->comp_job_cnt == 0)) + nodeacct_storage_g_node_down(node_ptr, now, + NULL); } else { info ("Invalid node state specified %d", @@ -1102,14 +1135,6 @@ int update_node ( update_node_msg_t * update_node_msg ) } } - if ((update_node_msg -> reason) && - (update_node_msg -> reason[0])) { - xfree(node_ptr->reason); - node_ptr->reason = xstrdup(update_node_msg->reason); - info ("update_node: node %s reason set to: %s", - this_node_name, node_ptr->reason); - } - base_state = node_ptr->node_state & NODE_STATE_BASE; if ((base_state != NODE_STATE_DOWN) && ((node_ptr->node_state & (NODE_STATE_DRAIN | @@ -1280,6 +1305,7 @@ extern int drain_nodes ( char *nodes, char *reason ) struct node_record *node_ptr; char *this_node_name ; hostlist_t host_list; + time_t now = time(NULL); if ((nodes == NULL) || (nodes[0] == '\0')) { error ("drain_nodes: invalid node name %s", nodes); @@ -1316,6 +1342,11 @@ extern int drain_nodes ( char *nodes, char *reason ) xfree(node_ptr->reason); node_ptr->reason = xstrdup(reason); + if ((node_ptr->run_job_cnt == 0) && + (node_ptr->comp_job_cnt == 0)) { + /* no jobs, node is drained */ + nodeacct_storage_g_node_down(node_ptr, now, NULL); + } select_g_update_node_state(node_inx, node_ptr->node_state); @@ -1518,6 +1549,7 @@ validate_node_specs (char *node_name, uint16_t cpus, node_ptr->last_idle = now; } xfree(node_ptr->reason); + nodeacct_storage_g_node_up(node_ptr, now); } else if ((base_state == NODE_STATE_DOWN) && (slurmctld_conf.ret2service == 1) && (node_ptr->reason != NULL) && @@ -1536,6 +1568,7 @@ validate_node_specs (char *node_name, uint16_t cpus, xfree(node_ptr->reason); reset_job_priority(); trigger_node_up(node_ptr); + nodeacct_storage_g_node_up(node_ptr, now); } else if ((base_state == NODE_STATE_ALLOCATED) && (job_count == 0)) { /* job vanished */ last_node_update = now; @@ -1710,6 +1743,9 @@ extern int 
validate_nodes_via_front_end(uint32_t job_count, node_ptr->last_idle = now; } xfree(node_ptr->reason); + if ((node_flags & + (NODE_STATE_DRAIN | NODE_STATE_FAIL)) == 0) + nodeacct_storage_g_node_up(node_ptr, now); } else if ((base_state == NODE_STATE_DOWN) && (slurmctld_conf.ret2service == 1)) { updated_job = true; @@ -1731,6 +1767,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count, node_ptr->name); xfree(node_ptr->reason); trigger_node_up(node_ptr); + nodeacct_storage_g_node_up(node_ptr, now); } else if ((base_state == NODE_STATE_ALLOCATED) && (jobs_on_node == 0)) { /* job vanished */ updated_job = true; @@ -1848,6 +1885,8 @@ static void _node_did_resp(struct node_record *node_ptr) last_node_update = now; node_ptr->last_idle = now; node_ptr->node_state = NODE_STATE_IDLE | node_flags; + if ((node_flags & (NODE_STATE_DRAIN | NODE_STATE_FAIL)) == 0) + nodeacct_storage_g_node_up(node_ptr, now); } if ((base_state == NODE_STATE_DOWN) && (slurmctld_conf.ret2service == 1) && @@ -1859,6 +1898,9 @@ static void _node_did_resp(struct node_record *node_ptr) info("node_did_resp: node %s returned to service", node_ptr->name); xfree(node_ptr->reason); + trigger_node_up(node_ptr); + if ((node_flags & (NODE_STATE_DRAIN | NODE_STATE_FAIL)) == 0) + nodeacct_storage_g_node_up(node_ptr, now); } base_state = node_ptr->node_state & NODE_STATE_BASE; if ((base_state == NODE_STATE_IDLE) @@ -1939,6 +1981,7 @@ static void _node_not_resp (struct node_record *node_ptr, time_t msg_time) void set_node_down (char *name, char *reason) { struct node_record *node_ptr; + time_t now = time(NULL); node_ptr = find_node_record (name); if (node_ptr == NULL) { @@ -1946,7 +1989,6 @@ void set_node_down (char *name, char *reason) return; } - _make_node_down(node_ptr); (void) kill_running_job_by_node_name(name, false); if ((node_ptr->reason == NULL) || (strncmp(node_ptr->reason, "Not responding", 14) == 0)) { @@ -1961,6 +2003,7 @@ void set_node_down (char *name, char *reason) node_ptr->reason = 
xstrdup(reason); xstrcat(node_ptr->reason, time_buf); } + _make_node_down(node_ptr, now); return; } @@ -2157,6 +2200,9 @@ extern void make_node_comp(struct node_record *node_ptr, if ((node_ptr->run_job_cnt == 0) && (node_ptr->comp_job_cnt == 0)) { bit_set(idle_node_bitmap, inx); + if ((node_ptr->node_state & NODE_STATE_DRAIN) || + (node_ptr->node_state & NODE_STATE_FAIL)) + nodeacct_storage_g_node_down(node_ptr, now, NULL); } if (base_state == NODE_STATE_DOWN) { @@ -2171,11 +2217,11 @@ extern void make_node_comp(struct node_record *node_ptr, } /* _make_node_down - flag specified node as down */ -static void _make_node_down(struct node_record *node_ptr) +static void _make_node_down(struct node_record *node_ptr, time_t event_time) { int inx = node_ptr - node_record_table_ptr; uint16_t node_flags; - + xassert(node_ptr); last_node_update = time (NULL); node_flags = node_ptr->node_state & NODE_STATE_FLAGS; @@ -2187,6 +2233,7 @@ static void _make_node_down(struct node_record *node_ptr) bit_clear (up_node_bitmap, inx); select_g_update_node_state(inx, node_ptr->node_state); trigger_node_down(node_ptr); + nodeacct_storage_g_node_down(node_ptr, event_time, NULL); } /* @@ -2259,6 +2306,7 @@ void make_node_idle(struct node_record *node_ptr, debug3("make_node_idle: Node %s is DRAINED", node_ptr->name); node_ptr->last_idle = now; + nodeacct_storage_g_node_down(node_ptr, now, NULL); } else if (node_ptr->run_job_cnt) { node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags; } else { diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 6c06d4d896df22add634d67cd198d4ec699d3ba3..037bae5e045c8f09819f4fb2dc236ff1826b6edd 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -62,6 +62,7 @@ #include "src/common/xassert.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" +#include "src/common/slurm_jobacct_storage.h" #include "src/slurmctld/agent.h" #include "src/slurmctld/node_scheduler.h" @@ -153,7 +154,7 @@ 
extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, if (select_g_job_fini(job_ptr) != SLURM_SUCCESS) error("select_g_job_fini(%u): %m", job_ptr->job_id); - + agent_args = xmalloc(sizeof(agent_arg_t)); if (timeout) agent_args->msg_type = REQUEST_KILL_TIMELIMIT; @@ -195,6 +196,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, delete_step_records(job_ptr, 1); slurm_sched_schedule(); } + if (agent_args->node_count == 0) { error("Job %u allocated no nodes to be killed on", job_ptr->job_id); @@ -204,6 +206,11 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, xfree(agent_args); return; } + + /* log this in the accounting plugin since it was allocated + * something */ + jobacct_storage_g_job_complete(job_ptr); + agent_args->msg_args = kill_job; agent_queue_request(agent_args); return; @@ -971,6 +978,8 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, if (job_ptr->mail_type & MAIL_JOB_BEGIN) mail_job_info(job_ptr, MAIL_JOB_BEGIN); + jobacct_storage_g_job_start(job_ptr); + cleanup: if (select_node_bitmap) *select_node_bitmap = select_bitmap; diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 672273bbf226f0a161c8e219e27822f2fd4ef053..67ff808cdff03d512296f093f72211bd15e02b6a 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -119,6 +119,9 @@ /* Process pending trigger events every TRIGGER_INTERVAL seconds */ #define TRIGGER_INTERVAL 15 +/* Report current node accounting state every PERIODIC_NODE_ACCT seconds */ +#define PERIODIC_NODE_ACCT 300 + /* Pathname of group file record for checking update times */ #define GROUP_FILE "/etc/group" @@ -223,6 +226,7 @@ extern time_t last_node_update; /* time of last node record update */ extern int node_record_count; /* count in node_record_table_ptr */ extern bitstr_t *avail_node_bitmap; /* bitmap of available nodes, * not DOWN, DRAINED or DRAINING */ +extern uint32_t total_cpus; /* count of CPUs in the 
entire cluster */ extern bitstr_t *idle_node_bitmap; /* bitmap of idle nodes */ extern bitstr_t *share_node_bitmap; /* bitmap of sharable nodes */ extern bitstr_t *up_node_bitmap; /* bitmap of up nodes, not DOWN */