Skip to content
Snippets Groups Projects
Commit a2aee97f authored by David Gloe, committed by Danny Auble
Browse files

CRAY - more cleanup

parent 4e5b66d0
No related branches found
No related tags found
No related merge requests found
...@@ -44,12 +44,9 @@ ...@@ -44,12 +44,9 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <signal.h> #include <signal.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h> #include <string.h>
#include <unistd.h> #include <unistd.h>
#include <errno.h> #include <errno.h>
#include <dirent.h>
#include <inttypes.h> #include <inttypes.h>
#include <fcntl.h> #include <fcntl.h>
#include "limits.h" #include "limits.h"
...@@ -114,8 +111,7 @@ int init(void) ...@@ -114,8 +111,7 @@ int init(void)
debug_flags = slurm_get_debug_flags(); debug_flags = slurm_get_debug_flags();
#ifdef HAVE_NATIVE_CRAY #ifdef HAVE_NATIVE_CRAY
if (MAX_PORT < MIN_PORT) { if (MAX_PORT < MIN_PORT) {
error("(%s: %d: %s) MAX_PORT: %d < MIN_PORT: %d", CRAY_ERR("MAX_PORT: %d < MIN_PORT: %d", MAX_PORT, MIN_PORT);
THIS_FILE, __LINE__, __FUNCTION__, MAX_PORT, MIN_PORT);
return SLURM_ERROR; return SLURM_ERROR;
} }
#endif #endif
...@@ -184,7 +180,7 @@ static void _state_read_buf(Buf buffer) ...@@ -184,7 +180,7 @@ static void _state_read_buf(Buf buffer)
return; return;
unpack_error: unpack_error:
error("(%s: %d: %s) unpack error", THIS_FILE, __LINE__, __FUNCTION__); CRAY_ERR("unpack error");
return; return;
} }
...@@ -215,10 +211,8 @@ int switch_p_libstate_save(char *dir_name) ...@@ -215,10 +211,8 @@ int switch_p_libstate_save(char *dir_name)
xassert(dir_name != NULL); xassert(dir_name != NULL);
if (debug_flags & DEBUG_FLAG_SWITCH) { if (debug_flags & DEBUG_FLAG_SWITCH)
info("(%s: %d: %s) save to %s", CRAY_INFO("save to %s", dir_name);
THIS_FILE, __LINE__, __FUNCTION__, dir_name);
}
buffer = init_buf(SWITCH_BUF_SIZE); buffer = init_buf(SWITCH_BUF_SIZE);
_state_write_buf(buffer); _state_write_buf(buffer);
...@@ -227,7 +221,7 @@ int switch_p_libstate_save(char *dir_name) ...@@ -227,7 +221,7 @@ int switch_p_libstate_save(char *dir_name)
(void) unlink(file_name); (void) unlink(file_name);
state_fd = creat(file_name, 0600); state_fd = creat(file_name, 0600);
if (state_fd < 0) { if (state_fd < 0) {
error("Can't save state, error creating file %s %m", CRAY_ERR("Can't save state, error creating file %s %m",
file_name); file_name);
ret = SLURM_ERROR; ret = SLURM_ERROR;
} else { } else {
...@@ -240,7 +234,7 @@ int switch_p_libstate_save(char *dir_name) ...@@ -240,7 +234,7 @@ int switch_p_libstate_save(char *dir_name)
if (wrote == 0) if (wrote == 0)
break; break;
if (wrote < 0) { if (wrote < 0) {
error("Can't save switch state: %m"); CRAY_ERR("Can't save switch state: %m");
ret = SLURM_ERROR; ret = SLURM_ERROR;
break; break;
} }
...@@ -271,9 +265,8 @@ int switch_p_libstate_restore(char *dir_name, bool recover) ...@@ -271,9 +265,8 @@ int switch_p_libstate_restore(char *dir_name, bool recover)
xassert(dir_name != NULL); xassert(dir_name != NULL);
if (debug_flags & DEBUG_FLAG_SWITCH) { if (debug_flags & DEBUG_FLAG_SWITCH) {
info("(%s: %d: %s) restore from %s, recover %d", CRAY_INFO("restore from %s, recover %d",
THIS_FILE, __LINE__, __FUNCTION__, dir_name, dir_name, (int) recover);
(int) recover);
} }
if (!recover) /* clean start, no recovery */ if (!recover) /* clean start, no recovery */
...@@ -291,7 +284,7 @@ int switch_p_libstate_restore(char *dir_name, bool recover) ...@@ -291,7 +284,7 @@ int switch_p_libstate_restore(char *dir_name, bool recover)
if ((data_read < 0) && (errno == EINTR)) if ((data_read < 0) && (errno == EINTR))
continue; continue;
if (data_read < 0) { if (data_read < 0) {
error ("Read error on %s, %m", file_name); CRAY_ERR("Read error on %s, %m", file_name);
error_code = SLURM_ERROR; error_code = SLURM_ERROR;
break; break;
} else if (data_read == 0) } else if (data_read == 0)
...@@ -304,8 +297,9 @@ int switch_p_libstate_restore(char *dir_name, bool recover) ...@@ -304,8 +297,9 @@ int switch_p_libstate_restore(char *dir_name, bool recover)
(void) unlink(file_name); /* One chance to recover */ (void) unlink(file_name); /* One chance to recover */
xfree(file_name); xfree(file_name);
} else { } else {
error("No %s file for switch/cray state recovery", file_name); CRAY_ERR("No %s file for switch/cray state recovery",
error("Starting switch/cray with clean state"); file_name);
CRAY_ERR("Starting switch/cray with clean state");
xfree(file_name); xfree(file_name);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
...@@ -378,8 +372,7 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job, ...@@ -378,8 +372,7 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job; slurm_cray_jobinfo_t *job = (slurm_cray_jobinfo_t *) switch_job;
if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) { if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
debug2("(%s: %d: %s) switch_job was NULL", THIS_FILE, __LINE__, CRAY_DEBUG("switch_job was NULL");
__FUNCTION__);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
...@@ -388,15 +381,13 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job, ...@@ -388,15 +381,13 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
rc = list_str_to_array(step_layout->node_list, &cnt, &nodes); rc = list_str_to_array(step_layout->node_list, &cnt, &nodes);
if (rc < 0) { if (rc < 0) {
error("(%s: %d: %s) list_str_to_array failed", CRAY_ERR("list_str_to_array failed");
THIS_FILE, __LINE__, __FUNCTION__);
return SLURM_ERROR; return SLURM_ERROR;
} }
if (step_layout->node_cnt != cnt) { if (step_layout->node_cnt != cnt) {
error("(%s: %d: %s) list_str_to_array returned count %" CRAY_ERR("list_str_to_array returned count %"
PRIu32 "does not match expected count %d", PRIu32 "does not match expected count %d",
THIS_FILE, __LINE__, cnt, step_layout->node_cnt);
__FUNCTION__, cnt, step_layout->node_cnt);
} }
/* /*
...@@ -427,7 +418,7 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job, ...@@ -427,7 +418,7 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
/* /*
* Cookie ID safety check: The cookie_ids should be positive numbers. * Cookie ID safety check: The cookie_ids should be positive numbers.
*/ */
for (i=0; i<num_cookies; i++) { for (i = 0; i < num_cookies; i++) {
if (cookie_ids[i] < 0) { if (cookie_ids[i] < 0) {
CRAY_ERR("alpsc_lease_cookies returned a cookie ID " CRAY_ERR("alpsc_lease_cookies returned a cookie ID "
"number %d with a negative value: %d", "number %d with a negative value: %d",
...@@ -459,8 +450,7 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job, ...@@ -459,8 +450,7 @@ int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
*/ */
rc = assign_port(&port); rc = assign_port(&port);
if (rc < 0) { if (rc < 0) {
info("(%s: %d: %s) assign_port failed", CRAY_INFO("assign_port failed");
THIS_FILE, __LINE__, __FUNCTION__);
return SLURM_ERROR; return SLURM_ERROR;
} }
...@@ -485,13 +475,12 @@ void switch_p_free_jobinfo(switch_jobinfo_t *switch_job) ...@@ -485,13 +475,12 @@ void switch_p_free_jobinfo(switch_jobinfo_t *switch_job)
int i; int i;
if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) { if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
debug2("(%s: %d: %s) switch_job was NULL", THIS_FILE, __LINE__, CRAY_DEBUG("switch_job was NULL");
__FUNCTION__);
return; return;
} }
if (job->magic != CRAY_JOBINFO_MAGIC) { if (job->magic != CRAY_JOBINFO_MAGIC) {
error("job is not a switch/cray slurm_cray_jobinfo_t"); CRAY_ERR("job is not a switch/cray slurm_cray_jobinfo_t");
return; return;
} }
...@@ -544,8 +533,7 @@ int switch_p_pack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer, ...@@ -544,8 +533,7 @@ int switch_p_pack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer,
xassert(job->magic == CRAY_JOBINFO_MAGIC); xassert(job->magic == CRAY_JOBINFO_MAGIC);
if (debug_flags & DEBUG_FLAG_SWITCH) { if (debug_flags & DEBUG_FLAG_SWITCH) {
info("(%s: %d: %s) switch_jobinfo_t contents", CRAY_INFO("switch_jobinfo_t contents:");
THIS_FILE, __LINE__, __FUNCTION__);
print_jobinfo(job); print_jobinfo(job);
} }
...@@ -565,8 +553,7 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer, ...@@ -565,8 +553,7 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer,
slurm_cray_jobinfo_t *job; slurm_cray_jobinfo_t *job;
if (!switch_job) { if (!switch_job) {
debug2("(%s: %d: %s) switch_job was NULL", THIS_FILE, __LINE__, CRAY_DEBUG("switch_job was NULL");
__FUNCTION__);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
...@@ -577,8 +564,7 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer, ...@@ -577,8 +564,7 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer,
safe_unpack32(&job->magic, buffer); safe_unpack32(&job->magic, buffer);
if (job->magic == CRAY_NULL_JOBINFO_MAGIC) { if (job->magic == CRAY_NULL_JOBINFO_MAGIC) {
debug2("(%s: %d: %s) Nothing to unpack.", CRAY_DEBUG("Nothing to unpack");
THIS_FILE, __LINE__, __FUNCTION__);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
...@@ -586,18 +572,16 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer, ...@@ -586,18 +572,16 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer,
safe_unpack32(&(job->num_cookies), buffer); safe_unpack32(&(job->num_cookies), buffer);
safe_unpackstr_array(&(job->cookies), &num_cookies, buffer); safe_unpackstr_array(&(job->cookies), &num_cookies, buffer);
if (num_cookies != job->num_cookies) { if (num_cookies != job->num_cookies) {
error("(%s: %d: %s) Wrong number of cookies received." CRAY_ERR("Wrong number of cookies received."
" Expected: %" PRIu32 "Received: %" PRIu32, " Expected: %" PRIu32 "Received: %" PRIu32,
THIS_FILE, __LINE__, __FUNCTION__, job->num_cookies, num_cookies);
job->num_cookies, num_cookies);
goto unpack_error; goto unpack_error;
} }
safe_unpack32_array(&(job->cookie_ids), &num_cookies, buffer); safe_unpack32_array(&(job->cookie_ids), &num_cookies, buffer);
if (num_cookies != job->num_cookies) { if (num_cookies != job->num_cookies) {
error("(%s: %d: %s) Wrong number of cookie IDs received." CRAY_ERR("Wrong number of cookie IDs received."
" Expected: %" PRIu32 "Received: %" PRIu32, " Expected: %" PRIu32 "Received: %" PRIu32,
THIS_FILE, __LINE__, __FUNCTION__, job->num_cookies, num_cookies);
job->num_cookies, num_cookies);
goto unpack_error; goto unpack_error;
} }
safe_unpack32(&job->port, buffer); safe_unpack32(&job->port, buffer);
...@@ -609,17 +593,14 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer, ...@@ -609,17 +593,14 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer,
#endif #endif
if (debug_flags & DEBUG_FLAG_SWITCH) { if (debug_flags & DEBUG_FLAG_SWITCH) {
info("(%s:%d: %s) switch_jobinfo_t contents:", CRAY_INFO("switch_jobinfo_t contents:");
THIS_FILE, __LINE__, __FUNCTION__);
print_jobinfo(job); print_jobinfo(job);
} }
return SLURM_SUCCESS; return SLURM_SUCCESS;
unpack_error: unpack_error:
error("(%s:%d: %s) Unpacking error", THIS_FILE, __LINE__, CRAY_ERR("Unpacking error");
__FUNCTION__);
if (job->num_cookies) { if (job->num_cookies) {
// Free the cookie_ids // Free the cookie_ids
if (job->cookie_ids) if (job->cookie_ids)
...@@ -700,10 +681,9 @@ extern int switch_p_job_init(stepd_step_rec_t *job) ...@@ -700,10 +681,9 @@ extern int switch_p_job_init(stepd_step_rec_t *job)
xassert(sw_job->magic == CRAY_JOBINFO_MAGIC); xassert(sw_job->magic == CRAY_JOBINFO_MAGIC);
if (debug_flags & DEBUG_FLAG_SWITCH) { if (debug_flags & DEBUG_FLAG_SWITCH) {
info("(%s:%d: %s) Job ID (in JOB): %" PRIu32 CRAY_INFO("Job ID (in JOB): %" PRIu32
"Job ID (in Switch jobinfo): %" PRIu32, "Job ID (in Switch jobinfo): %" PRIu32,
THIS_FILE, __LINE__, __FUNCTION__, job->jobid, job->jobid, sw_job->jobid);
sw_job->jobid);
} }
rc = alpsc_attach_cncu_container(&err_msg, sw_job->jobid, job->cont_id); rc = alpsc_attach_cncu_container(&err_msg, sw_job->jobid, job->cont_id);
...@@ -831,8 +811,6 @@ extern int switch_p_job_init(stepd_step_rec_t *job) ...@@ -831,8 +811,6 @@ extern int switch_p_job_init(stepd_step_rec_t *job)
/* /*
* Query the generic resources to see if the GPU should be allocated * Query the generic resources to see if the GPU should be allocated
* TODO: Determine whether the proxy should be enabled or disabled by
* reading the user's environment variable.
*/ */
rc = gres_get_step_info(job->step_gres_list, "gpu", 0, rc = gres_get_step_info(job->step_gres_list, "gpu", 0,
...@@ -950,6 +928,7 @@ int switch_p_job_postfini(stepd_step_rec_t *job) ...@@ -950,6 +928,7 @@ int switch_p_job_postfini(stepd_step_rec_t *job)
/* /*
* Clean-up * Clean-up
* *
* 0. Reset GPU proxy
* 1. Flush Lustre caches * 1. Flush Lustre caches
* 2. Flush virtual memory * 2. Flush virtual memory
* 3. Compact memory * 3. Compact memory
...@@ -1089,8 +1068,7 @@ extern int switch_p_job_step_complete(switch_jobinfo_t *jobinfo, ...@@ -1089,8 +1068,7 @@ extern int switch_p_job_step_complete(switch_jobinfo_t *jobinfo,
*/ */
rc = release_port(job->port); rc = release_port(job->port);
if (rc != 0) { if (rc != 0) {
error("(%s: %d: %s) Releasing port %" PRIu32 " failed.", CRAY_ERR("Releasing port %" PRIu32 " failed.", job->port);
THIS_FILE, __LINE__, __FUNCTION__, job->port);
// return SLURM_ERROR; // return SLURM_ERROR;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment