diff --git a/src/common/checkpoint.c b/src/common/checkpoint.c index 347c95f24cb6f1ced076880ec82112726ba5eb92..eee6ba454f6a70b794b278057edfa312c44c3966 100644 --- a/src/common/checkpoint.c +++ b/src/common/checkpoint.c @@ -72,8 +72,10 @@ typedef struct slurm_checkpoint_ops { int (*ckpt_alloc_jobinfo) (check_jobinfo_t *jobinfo); int (*ckpt_free_jobinfo) (check_jobinfo_t jobinfo); - int (*ckpt_pack_jobinfo) (check_jobinfo_t jobinfo, Buf buffer); - int (*ckpt_unpack_jobinfo) (check_jobinfo_t jobinfo, Buf buffer); + int (*ckpt_pack_jobinfo) (check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version); + int (*ckpt_unpack_jobinfo) (check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version); int (*ckpt_stepd_prefork) (void *slurmd_job); int (*ckpt_signal_tasks) (void *slurmd_job, char *image_dir); int (*ckpt_restart_task) (void *slurmd_job, char *image_dir, int gtid); @@ -368,14 +370,15 @@ extern int checkpoint_free_jobinfo(check_jobinfo_t jobinfo) } /* un/pack a job step's checkpoint context */ -extern int checkpoint_pack_jobinfo (check_jobinfo_t jobinfo, Buf buffer) +extern int checkpoint_pack_jobinfo (check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { int retval = SLURM_SUCCESS; slurm_mutex_lock( &context_lock ); if ( g_context ) retval = (*(g_context->ops.ckpt_pack_jobinfo))( - jobinfo, buffer); + jobinfo, buffer, protocol_version); else { error ("slurm_checkpoint plugin context not initialized"); retval = ENOENT; @@ -384,14 +387,15 @@ extern int checkpoint_pack_jobinfo (check_jobinfo_t jobinfo, Buf buffer) return retval; } -extern int checkpoint_unpack_jobinfo (check_jobinfo_t jobinfo, Buf buffer) +extern int checkpoint_unpack_jobinfo (check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { int retval = SLURM_SUCCESS; slurm_mutex_lock( &context_lock ); if ( g_context ) retval = (*(g_context->ops.ckpt_unpack_jobinfo))( - jobinfo, buffer); + jobinfo, buffer, protocol_version); else { error ("slurm_checkpoint plugin context not initialized"); retval = ENOENT; diff --git a/src/common/checkpoint.h b/src/common/checkpoint.h index d65c22f335fe537d41474613d5d65744f9ff9abf..b7bfc90fe4332dbf0ae6c0c995ded82687e270ea 100644 --- a/src/common/checkpoint.h +++ b/src/common/checkpoint.h @@ -79,11 +79,12 @@ extern int checkpoint_op(uint32_t job_id, uint32_t step_id, uint32_t *error_code, char **error_msg); /* note checkpoint completion */ -extern int checkpoint_comp(void * step_ptr, time_t event_time, uint32_t error_code, - char *error_msg); +extern int checkpoint_comp(void * step_ptr, time_t event_time, + uint32_t error_code, char *error_msg); extern int checkpoint_task_comp(void * step_ptr, uint32_t task_id, - time_t event_time, uint32_t error_code, char *error_msg); + time_t event_time, uint32_t error_code, + char *error_msg); /* gather checkpoint error info */ extern int checkpoint_error(void * step_ptr, @@ -96,8 +97,10 @@ extern int checkpoint_alloc_jobinfo(check_jobinfo_t *jobinfo); extern int checkpoint_free_jobinfo(check_jobinfo_t jobinfo); /* un/pack a job step's checkpoint context */ -extern int checkpoint_pack_jobinfo (check_jobinfo_t jobinfo, Buf buffer); -extern int checkpoint_unpack_jobinfo (check_jobinfo_t jobinfo, Buf buffer); +extern int checkpoint_pack_jobinfo (check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version); +extern int checkpoint_unpack_jobinfo (check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version); /* create the necessary threads before forking the tasks */ extern int checkpoint_stepd_prefork (void *slurmd_job); diff --git a/src/plugins/checkpoint/aix/checkpoint_aix.c b/src/plugins/checkpoint/aix/checkpoint_aix.c index 04fcdbaff9e7dcc7ca5f886528399cd5acd5b863..41baffc84d0795e6fee02f72e7b17d466a9e61ff 100644 --- a/src/plugins/checkpoint/aix/checkpoint_aix.c +++ b/src/plugins/checkpoint/aix/checkpoint_aix.c @@ -311,37 +311,44 @@ extern int slurm_ckpt_free_job(check_jobinfo_t jobinfo) return SLURM_SUCCESS; } -extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - pack16(check_ptr->disabled, buffer); - pack16(check_ptr->node_cnt, buffer); - pack16(check_ptr->reply_cnt, buffer); - pack16(check_ptr->wait_time, buffer); + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + pack16(check_ptr->disabled, buffer); + pack16(check_ptr->node_cnt, buffer); + pack16(check_ptr->reply_cnt, buffer); + pack16(check_ptr->wait_time, buffer); - pack32(check_ptr->error_code, buffer); - packstr(check_ptr->error_msg, buffer); - pack_time(check_ptr->time_stamp, buffer); + pack32(check_ptr->error_code, buffer); + packstr(check_ptr->error_msg, buffer); + pack_time(check_ptr->time_stamp, buffer); + } return SLURM_SUCCESS; } -extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { uint32_t uint32_tmp; struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - safe_unpack16(&check_ptr->disabled, buffer); - safe_unpack16(&check_ptr->node_cnt, buffer); - safe_unpack16(&check_ptr->reply_cnt, buffer); - safe_unpack16(&check_ptr->wait_time, buffer); + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + safe_unpack16(&check_ptr->disabled, buffer); + safe_unpack16(&check_ptr->node_cnt, buffer); + safe_unpack16(&check_ptr->reply_cnt, buffer); + safe_unpack16(&check_ptr->wait_time, buffer); - safe_unpack32(&check_ptr->error_code, buffer); - safe_unpackstr_xmalloc(&check_ptr->error_msg, &uint32_tmp, buffer); - safe_unpack_time(&check_ptr->time_stamp, buffer); + safe_unpack32(&check_ptr->error_code, buffer); + safe_unpackstr_xmalloc(&check_ptr->error_msg, + &uint32_tmp, buffer); + safe_unpack_time(&check_ptr->time_stamp, buffer); + } return SLURM_SUCCESS; diff --git a/src/plugins/checkpoint/blcr/checkpoint_blcr.c b/src/plugins/checkpoint/blcr/checkpoint_blcr.c index 05fe622bd73c6367d63cbe4775db3e439051131d..86a4b677c5ea2047b8f4b10623fe6fbed0fc26da 100644 --- a/src/plugins/checkpoint/blcr/checkpoint_blcr.c +++ b/src/plugins/checkpoint/blcr/checkpoint_blcr.c @@ -321,29 +321,36 @@ extern int slurm_ckpt_free_job(check_jobinfo_t jobinfo) return SLURM_SUCCESS; } -extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - pack16(check_ptr->disabled, buffer); - pack_time(check_ptr->time_stamp, buffer); - pack32(check_ptr->error_code, buffer); - packstr(check_ptr->error_msg, buffer); + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + pack16(check_ptr->disabled, buffer); + pack_time(check_ptr->time_stamp, buffer); + pack32(check_ptr->error_code, buffer); + packstr(check_ptr->error_msg, buffer); + } return SLURM_SUCCESS; } -extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { uint32_t uint32_tmp; struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - safe_unpack16(&check_ptr->disabled, buffer); - safe_unpack_time(&check_ptr->time_stamp, buffer); - safe_unpack32(&check_ptr->error_code, buffer); - safe_unpackstr_xmalloc(&check_ptr->error_msg, &uint32_tmp, buffer); + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + safe_unpack16(&check_ptr->disabled, buffer); + safe_unpack_time(&check_ptr->time_stamp, buffer); + safe_unpack32(&check_ptr->error_code, buffer); + safe_unpackstr_xmalloc(&check_ptr->error_msg, + &uint32_tmp, buffer); + } return SLURM_SUCCESS; diff --git a/src/plugins/checkpoint/none/checkpoint_none.c b/src/plugins/checkpoint/none/checkpoint_none.c index 4e4259c51bd31218476900241ec0bc3312196321..253c62a71c21b9ebec647c349d304ae8c9014c65 100644 --- a/src/plugins/checkpoint/none/checkpoint_none.c +++ b/src/plugins/checkpoint/none/checkpoint_none.c @@ -128,12 +128,14 @@ extern int slurm_ckpt_free_job(check_jobinfo_t jobinfo) return SLURM_SUCCESS; } -extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { return SLURM_SUCCESS; } -extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { return SLURM_SUCCESS; } diff --git a/src/plugins/checkpoint/ompi/checkpoint_ompi.c b/src/plugins/checkpoint/ompi/checkpoint_ompi.c index a69ce989aa9ddb67995a0d68b7e2e58839983c8a..2d53e36a3ae0f9bbd5f752acc00418d234ea08c5 100644 --- a/src/plugins/checkpoint/ompi/checkpoint_ompi.c +++ b/src/plugins/checkpoint/ompi/checkpoint_ompi.c @@ -243,35 +243,42 @@ extern int slurm_ckpt_free_job(check_jobinfo_t jobinfo) return SLURM_SUCCESS; } -extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - pack16(check_ptr->disabled, buffer); - pack16(check_ptr->reply_cnt, buffer); - pack16(check_ptr->wait_time, buffer); + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + pack16(check_ptr->disabled, buffer); + pack16(check_ptr->reply_cnt, buffer); + pack16(check_ptr->wait_time, buffer); - pack32(check_ptr->error_code, buffer); - packstr(check_ptr->error_msg, buffer); - pack_time(check_ptr->time_stamp, buffer); + pack32(check_ptr->error_code, buffer); + packstr(check_ptr->error_msg, buffer); + pack_time(check_ptr->time_stamp, buffer); + } return SLURM_SUCCESS; } -extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { uint32_t uint32_tmp; struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - safe_unpack16(&check_ptr->disabled, buffer); - safe_unpack16(&check_ptr->reply_cnt, buffer); - safe_unpack16(&check_ptr->wait_time, buffer); + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + safe_unpack16(&check_ptr->disabled, buffer); + safe_unpack16(&check_ptr->reply_cnt, buffer); + safe_unpack16(&check_ptr->wait_time, buffer); - safe_unpack32(&check_ptr->error_code, buffer); - safe_unpackstr_xmalloc(&check_ptr->error_msg, &uint32_tmp, buffer); - safe_unpack_time(&check_ptr->time_stamp, buffer); + safe_unpack32(&check_ptr->error_code, buffer); + safe_unpackstr_xmalloc(&check_ptr->error_msg, + &uint32_tmp, buffer); + safe_unpack_time(&check_ptr->time_stamp, buffer); + } return SLURM_SUCCESS; diff --git a/src/plugins/checkpoint/xlch/checkpoint_xlch.c b/src/plugins/checkpoint/xlch/checkpoint_xlch.c index f0627a264a60c0c32a108e128a0ee2153cf6e27c..682723e6497b7d79016db5a6b3e00eb2fb3027ee 100644 --- a/src/plugins/checkpoint/xlch/checkpoint_xlch.c +++ b/src/plugins/checkpoint/xlch/checkpoint_xlch.c @@ -382,47 +382,54 @@ extern int slurm_ckpt_free_job(check_jobinfo_t jobinfo) return SLURM_SUCCESS; } -extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - pack16(check_ptr->disabled, buffer); - pack16(check_ptr->task_cnt, buffer); - pack16(check_ptr->reply_cnt, buffer); - pack16(check_ptr->wait_time, buffer); - pack_bit_fmt(check_ptr->replied, buffer); + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + pack16(check_ptr->disabled, buffer); + pack16(check_ptr->task_cnt, buffer); + pack16(check_ptr->reply_cnt, buffer); + pack16(check_ptr->wait_time, buffer); + pack_bit_fmt(check_ptr->replied, buffer); - pack32(check_ptr->error_code, buffer); - packstr(check_ptr->error_msg, buffer); - pack_time(check_ptr->time_stamp, buffer); + pack32(check_ptr->error_code, buffer); + packstr(check_ptr->error_msg, buffer); + pack_time(check_ptr->time_stamp, buffer); + } return SLURM_SUCCESS; } -extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer) +extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer, + uint16_t protocol_version) { uint32_t uint32_tmp; char *task_inx_str; struct check_job_info *check_ptr = (struct check_job_info *)jobinfo; - safe_unpack16(&check_ptr->disabled, buffer); - safe_unpack16(&check_ptr->task_cnt, buffer); - safe_unpack16(&check_ptr->reply_cnt, buffer); - safe_unpack16(&check_ptr->wait_time, buffer); - safe_unpackstr_xmalloc(&task_inx_str, &uint32_tmp, buffer); - if (task_inx_str == NULL) - check_ptr->replied = NULL; - else { - check_ptr->replied = bit_alloc(check_ptr->task_cnt); - bit_unfmt(check_ptr->replied, task_inx_str); - xfree(task_inx_str); - } + if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + safe_unpack16(&check_ptr->disabled, buffer); + safe_unpack16(&check_ptr->task_cnt, buffer); + safe_unpack16(&check_ptr->reply_cnt, buffer); + safe_unpack16(&check_ptr->wait_time, buffer); + safe_unpackstr_xmalloc(&task_inx_str, &uint32_tmp, buffer); + if (task_inx_str == NULL) + check_ptr->replied = NULL; + else { + check_ptr->replied = bit_alloc(check_ptr->task_cnt); + bit_unfmt(check_ptr->replied, task_inx_str); + xfree(task_inx_str); + } - safe_unpack32(&check_ptr->error_code, buffer); - safe_unpackstr_xmalloc(&check_ptr->error_msg, &uint32_tmp, buffer); - safe_unpack_time(&check_ptr->time_stamp, buffer); + safe_unpack32(&check_ptr->error_code, buffer); + safe_unpackstr_xmalloc(&check_ptr->error_msg, + &uint32_tmp, buffer); + safe_unpack_time(&check_ptr->time_stamp, buffer); + } return SLURM_SUCCESS; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 001ea978d12f5f8d98231630343af8b837ab8177..c3d796a4cd3693005035dde177ec583daa2547d4 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -782,7 +782,8 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) SLURM_PROTOCOL_VERSION); pack16(dump_job_ptr->ckpt_interval, buffer); - checkpoint_pack_jobinfo(dump_job_ptr->check_job, buffer); + checkpoint_pack_jobinfo(dump_job_ptr->check_job, buffer, + SLURM_PROTOCOL_VERSION); packstr_array(dump_job_ptr->spank_job_env, dump_job_ptr->spank_job_env_size, buffer); @@ -946,7 +947,8 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) safe_unpack16(&ckpt_interval, buffer); if (checkpoint_alloc_jobinfo(&check_job) || - checkpoint_unpack_jobinfo(check_job, buffer)) + checkpoint_unpack_jobinfo(check_job, buffer, + protocol_version)) goto unpack_error; safe_unpackstr_array(&spank_job_env, &spank_job_env_size, @@ -1091,7 +1093,8 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) safe_unpack16(&ckpt_interval, buffer); if (checkpoint_alloc_jobinfo(&check_job) || - checkpoint_unpack_jobinfo(check_job, buffer)) + checkpoint_unpack_jobinfo(check_job, buffer, + protocol_version)) goto unpack_error; safe_unpackstr_array(&spank_job_env, &spank_job_env_size, diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 86069aa35c6c112ad79310de75e99a82b71247ef..3f0466a58a5e6a23e443c30b05389c137d7ef360 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -2483,7 +2483,8 @@ extern void dump_job_step_state(struct job_record *job_ptr, SLURM_PROTOCOL_VERSION); switch_pack_jobinfo(step_ptr->switch_job, buffer); } - checkpoint_pack_jobinfo(step_ptr->check_job, buffer); + checkpoint_pack_jobinfo(step_ptr->check_job, buffer, + SLURM_PROTOCOL_VERSION); } /* @@ -2558,7 +2559,8 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer, goto unpack_error; } checkpoint_alloc_jobinfo(&check_tmp); - if (checkpoint_unpack_jobinfo(check_tmp, buffer)) + if (checkpoint_unpack_jobinfo(check_tmp, buffer, + protocol_version)) goto unpack_error; /* validity test as possible */ @@ -2614,7 +2616,8 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer, goto unpack_error; } checkpoint_alloc_jobinfo(&check_tmp); - if (checkpoint_unpack_jobinfo(check_tmp, buffer)) + if (checkpoint_unpack_jobinfo(check_tmp, buffer, + protocol_version)) goto unpack_error; /* validity test as possible */