From 05f38e08bcf3ed063f9d9efd4221c82582bfecc8 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 10 Mar 2008 16:12:50 +0000 Subject: [PATCH] svn merge -r13515:13533 https://eris.llnl.gov/svn/slurm/branches/slurm-1.2 --- NEWS | 5 +- slurm/slurm.h.in | 95 +++++++++++++-------------- src/api/job_info.c | 4 +- src/common/slurm_protocol_pack.c | 2 +- src/scontrol/update_job.c | 10 +++ src/slurmctld/job_mgr.c | 31 +++++++-- src/slurmctld/slurmctld.h | 107 ++++++++++++++++--------------- 7 files changed, 146 insertions(+), 108 deletions(-) diff --git a/NEWS b/NEWS index 36b9c098ae1..a0f61ba81c8 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins. -- Restructure the sbcast RPC to take advantage of larger buffers available in Slurm v1.3 RPCs. -- Fix several memory leaks. + -- In scontrol, show job's Requeue value, permit change of Requeue and COmment + values. * Changes in SLURM 1.3.0-pre10 ============================== @@ -204,7 +206,8 @@ documents those changes that are of interest to users and admins. to control message timeout. -- Add threaded agent to manage a queue of Gold update requests for performance reasons. - -- Add slloc options --chdir and --get-user-env. + -- Add salloc options --chdir and --get-user-env (for Moab). + -- Modify scontrol update to support job comment changes. * Changes in SLURM 1.2.24 ========================= diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index dedb2dfdcd8..8eee044c451 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -577,27 +577,62 @@ typedef struct job_descriptor { /* For submit, allocate, and update requests */ } job_desc_msg_t; typedef struct job_info { - uint32_t job_id; /* job ID */ - char *name; /* name of the job */ - uint16_t batch_flag; /* 1 if batch: queued job with script */ - uint32_t alloc_sid; /* local sid making resource alloc */ + char *account; /* charge to specified account */ char *alloc_node; /* local node making resource alloc */ - uint32_t user_id; /* user the job runs as */ + uint32_t alloc_sid; /* local sid making resource alloc */ + uint16_t batch_flag; /* 1 if batch: queued job with script */ + char *command; /* command to be executed */ + char *comment; /* arbitrary comment (used by Moab scheduler) */ + uint16_t contiguous; /* 1 if job requires contiguous nodes */ + uint16_t cpus_per_task; /* number of processors required for each task */ + char *dependency; /* syncrhonize job execution with other jobs */ + time_t end_time; /* time of termination, actual or expected */ + char *exc_nodes; /* comma separated list of excluded nodes */ + int *exc_node_inx; /* excluded list index pairs into node_table: + * start_range_1, end_range_1, + * start_range_2, .., -1 */ + uint32_t exit_code; /* exit code for job (status from wait call) */ + char *features; /* comma separated list of required features */ uint32_t group_id; /* group job sumitted as */ + uint32_t job_id; /* job ID */ + uint16_t job_min_cores; /* minimum cores per processor, default=0 */ + uint32_t job_min_memory; /* minimum real memory per node, default=0 */ + uint16_t job_min_procs; /* minimum processors per node, default=0 */ + uint16_t job_min_sockets; /* minimum sockets per node, default=0 */ + uint16_t job_min_threads; /* minimum threads per core, default=0 */ + uint32_t job_min_tmp_disk; /* minimum tmp disk per node, default=0 */ uint16_t job_state; /* state of the job, see enum job_states */ - uint32_t time_limit; /* maximum run time in minutes or INFINITE */ - time_t 
submit_time; /* time of job submission */ - time_t start_time; /* time execution begins, actual or expected */ - time_t end_time; /* time of termination, actual or expected */ - time_t suspend_time; /* time job last suspended or resumed */ - time_t pre_sus_time; /* time job ran prior to last suspend */ - uint32_t priority; /* relative priority of the job, - * 0=held, 1=required nodes DOWN/DRAINED */ + char *licenses; /* licenses required by the job */ + char *name; /* name of the job */ + char *network; /* network specification */ char *nodes; /* list of nodes allocated to job */ int *node_inx; /* list index pairs into node_table for *nodes: * start_range_1, end_range_1, * start_range_2, .., -1 */ + uint16_t ntasks_per_core;/* number of tasks to invoke on each core */ + uint16_t ntasks_per_node;/* number of tasks to invoke on each node */ + uint16_t ntasks_per_socket;/* number of tasks to invoke on each socket */ char *partition; /* name of assigned partition */ + time_t pre_sus_time; /* time job ran prior to last suspend */ + uint32_t priority; /* relative priority of the job, + * 0=held, 1=required nodes DOWN/DRAINED */ + char *req_nodes; /* comma separated list of required nodes */ + int *req_node_inx; /* required list index pairs into node_table: + * start_range_1, end_range_1, + * start_range_2, .., -1 */ + uint16_t requeue; /* enable or disable job requeue option */ + select_jobinfo_t select_jobinfo; /* opaque data type, + * process using select_g_get_jobinfo() */ + uint16_t shared; /* 1 if job can share nodes with other jobs */ + time_t start_time; /* time execution begins, actual or expected */ + uint16_t state_reason; /* reason job still pending or failed, see + * slurm.h:enum job_state_reason */ + time_t submit_time; /* time of job submission */ + time_t suspend_time; /* time job last suspended or resumed */ + uint32_t time_limit; /* maximum run time in minutes or INFINITE */ + uint32_t user_id; /* user the job runs as */ + char *work_dir; /* pathname of working directory */ + uint16_t num_cpu_groups;/* elements in below cpu arrays */ uint32_t *cpus_per_node;/* cpus per node */ uint32_t *cpu_count_reps;/* how many nodes have same cpu count */ @@ -610,40 +645,6 @@ typedef struct job_info { uint16_t max_cores; /* maximum number of cores per cpu */ uint16_t min_threads; /* minimum number of threads per core */ uint16_t max_threads; /* maximum number of threads per core */ - uint16_t shared; /* 1 if job can share nodes with other jobs */ - uint16_t contiguous; /* 1 if job requires contiguous nodes */ - uint16_t cpus_per_task; /* number of processors required for each task */ - uint16_t ntasks_per_node;/* number of tasks to invoke on each node */ - uint16_t ntasks_per_socket;/* number of tasks to invoke on each socket */ - uint16_t ntasks_per_core;/* number of tasks to invoke on each core */ - /* job constraints: */ - uint16_t job_min_procs; /* minimum processors per node, default=0 */ - uint16_t job_min_sockets; /* minimum sockets per node, default=0 */ - uint16_t job_min_cores; /* minimum cores per processor, default=0 */ - uint16_t job_min_threads; /* minimum threads per core, default=0 */ - uint32_t job_min_memory; /* minimum real memory per node, default=0 */ - uint32_t job_min_tmp_disk; /* minimum tmp disk per node, default=0 */ - char *req_nodes; /* comma separated list of required nodes */ - int *req_node_inx; /* required list index pairs into node_table: - * start_range_1, end_range_1, - * start_range_2, .., -1 */ - char *exc_nodes; /* comma separated list of excluded nodes 
*/ - int *exc_node_inx; /* excluded list index pairs into node_table: - * start_range_1, end_range_1, - * start_range_2, .., -1 */ - char *features; /* comma separated list of required features */ - char *dependency; /* syncrhonize job execution with other jobs */ - uint32_t exit_code; /* exit code for job (status from wait call) */ - char *account; /* charge to specified account */ - uint16_t state_reason; /* reason job still pending or failed, see - * slurm.h:enum job_state_reason */ - char *network; /* network specification */ - char *comment; /* arbitrary comment (used by Moab scheduler) */ - char *work_dir; /* pathname of working directory */ - char *command; /* command to be executed */ - select_jobinfo_t select_jobinfo; /* opaque data type, - * process using select_g_get_jobinfo() */ - char *licenses; /* licenses required by the job */ } job_info_t; typedef struct job_info_msg { diff --git a/src/api/job_info.c b/src/api/job_info.c index e8a7ffa53d7..2df156bc8ce 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -381,8 +381,8 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) /****** Line 11 ******/ snprintf(tmp_line, sizeof(tmp_line), - "Dependency=%s Account=%s", - job_ptr->dependency, job_ptr->account); + "Dependency=%s Account=%s Requeue=%u", + job_ptr->dependency, job_ptr->account, job_ptr->requeue); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index e8b87494276..f1446dfcaea 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2122,7 +2122,7 @@ _unpack_job_info_members(job_info_t * job, Buf buffer) safe_unpack32(&job->num_nodes, buffer); safe_unpack32(&job->max_nodes, buffer); - + safe_unpack16(&job->requeue, buffer); /*** unpack pending job details ***/ safe_unpack16(&job->shared, buffer); diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 3813fdeb44b..540c216f617 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -209,6 +209,10 @@ scontrol_update_job (int argc, char *argv[]) job_msg.job_id = (uint32_t) strtol(&argv[i][6], (char **) NULL, 10); + else if (strncasecmp(argv[i], "Comment=", 8) == 0) { + job_msg.comment = &argv[i][8]; + update_cnt++; + } else if (strncasecmp(argv[i], "TimeLimit=", 10) == 0) { int time_limit = time_str2mins(&argv[i][10]); if ((time_limit < 0) && (time_limit != INFINITE)) { @@ -247,6 +251,12 @@ scontrol_update_job (int argc, char *argv[]) (char **) NULL, 10); update_cnt++; } + else if (strncasecmp(argv[i], "Requeue=", 8) == 0) { + job_msg.requeue = + (uint16_t) strtol(&argv[i][8], + (char **) NULL, 10); + update_cnt++; + } else if ((strncasecmp(argv[i], "MinNodes=", 9) == 0) || (strncasecmp(argv[i], "ReqNodes=", 9) == 0)) { char *tmp; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index cce759191f9..c969f049825 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3098,6 +3098,7 @@ static void _pack_default_job_details(struct job_details *detail_ptr, pack32(detail_ptr->min_nodes, buffer); pack32(detail_ptr->max_nodes, buffer); + pack16(detail_ptr->requeue, buffer); } else { packnull(buffer); packnull(buffer); @@ -3106,6 +3107,7 @@ static void _pack_default_job_details(struct job_details *detail_ptr, pack32((uint32_t) 0, buffer); pack32((uint32_t) 0, buffer); + pack16((uint16_t) 0, buffer); } } @@ -3785,6 +3787,9 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) info("update_job: setting features to %s for " "job_id 
%u", job_specs->features, job_specs->job_id); + } else { + info("update_job: cleared features for job %u", + job_specs->job_id); } } else { error("Attempt to change features for job %u", @@ -3793,11 +3798,26 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) } } + if (job_specs->comment) { + xfree(job_ptr->comment); + job_ptr->comment = job_specs->comment; + job_specs->comment = NULL; /* Nothing left to free */ + info("update_job: setting comment to %s for job_id %u", + job_ptr->comment, job_specs->job_id); + } + if (job_specs->name) { xfree(job_ptr->name); - job_ptr->name = xstrdup(job_specs->name); + job_ptr->name = job_specs->name; + job_specs->name = NULL; /* Nothing left to free */ info("update_job: setting name to %s for job_id %u", - job_specs->name, job_specs->job_id); + job_ptr->name, job_specs->job_id); + } + + if (job_specs->requeue) { + detail_ptr->requeue = job_specs->requeue; + info("update_job: setting requeue to %u for job_id %u", + job_specs->requeue, job_specs->job_id); } if (job_specs->partition) { @@ -3881,10 +3901,13 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) if (job_specs->account) { xfree(job_ptr->account); if (job_specs->account[0] != '\0') { - job_ptr->account = job_specs->account ; + job_ptr->account = job_specs->account; + job_specs->account = NULL; /* Nothing left to free */ info("update_job: setting account to %s for job_id %u", job_ptr->account, job_specs->job_id); - job_specs->account = NULL; + } else { + info("update_job: cleared account for job_id %u", + job_specs->job_id); } } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 82938e618e8..7be782183b1 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -337,73 +337,83 @@ struct job_details { }; struct job_record { - uint32_t job_id; /* job ID */ - uint32_t magic; /* magic cookie for data integrity */ - char *name; /* name of the job */ - char *partition; /* name of the partition */ - struct part_record *part_ptr; /* pointer to the partition record */ + char *account; /* account number to charge */ + char *alloc_node; /* local node making resource alloc */ + uint16_t alloc_resp_port; /* RESPONSE_RESOURCE_ALLOCATION port */ + uint32_t alloc_sid; /* local sid making resource alloc */ uint16_t batch_flag; /* 1 or 2 if batch job (with script), * 2 indicates retry mode (one retry) */ - uint32_t user_id; /* user the job runs as */ + char *comment; /* arbitrary comment */ + uint16_t cr_enabled; /* specify if if Consumable Resources + * is enabled. 
Needed since CR deals + * with a finer granularity in its + * node/cpu scheduling (available cpus + * instead of available nodes) than the + * bluegene and the linear plugins + * 0 if cr is NOT enabled, + * 1 if cr is enabled */ + uint32_t db_index; /* used only for database + plugins */ + struct job_details *details; /* job details */ + time_t end_time; /* time of termination, + * actual or expected */ + uint32_t exit_code; /* exit code for job (status from + * wait call) */ uint32_t group_id; /* group submitted under */ + uint32_t job_id; /* job ID */ + struct job_record *job_next; /* next entry with same hash index */ enum job_states job_state; /* state of the job */ uint16_t kill_on_node_fail; /* 1 if job should be killed on * node failure */ uint16_t kill_on_step_done; /* 1 if job should be killed when * the job step completes, 2 if kill * in progress */ - select_jobinfo_t select_jobinfo;/* opaque data */ + char *licenses; /* licenses required by the job */ + uint16_t mail_type; /* see MAIL_JOB_* in slurm.h */ + char *mail_user; /* user to get e-mail notification */ + uint32_t magic; /* magic cookie for data integrity */ + char *name; /* name of the job */ + char *network; /* network/switch requirement spec */ + uint16_t next_step_id; /* next step id to be used */ char *nodes; /* list of nodes allocated to job */ + slurm_addr *node_addr; /* addresses of the nodes allocated to + * job */ bitstr_t *node_bitmap; /* bitmap of nodes allocated to job */ + uint32_t node_cnt; /* count of nodes allocated to job */ char *nodes_completing; /* nodes still in completing state * for this job, used to insure * epilog is not re-run for job */ uint32_t num_procs; /* count of required processors */ - uint32_t total_procs; /* number of allocated processors, - for accounting */ - uint32_t time_limit; /* time_limit minutes or INFINITE, - * NO_VAL implies partition max_time */ + uint16_t other_port; /* port for client communications */ + char *partition; /* name of the partition */ + time_t pre_sus_time; /* time job ran prior to last suspend */ + uint32_t priority; /* relative priority of the job, + * zero == held (don't initiate) */ + uint32_t requid; /* requester user ID */ + char *resp_host; /* host for srun communications */ + select_jobinfo_t select_jobinfo;/* opaque data */ time_t start_time; /* time execution begins, * actual or expected */ - time_t end_time; /* time of termination, - * actual or expected */ + uint16_t state_reason; /* reason job still pending or failed + * see slurm.h:enum job_wait_reason */ + List step_list; /* list of job's steps */ time_t suspend_time; /* time job last suspended or resumed */ - time_t pre_sus_time; /* time job ran prior to last suspend */ - time_t tot_sus_time; /* total time in suspend state */ time_t time_last_active; /* time of last job activity */ - uint32_t priority; /* relative priority of the job, - * zero == held (don't initiate) */ - struct job_details *details; /* job details */ + uint32_t time_limit; /* time_limit minutes or INFINITE, + * NO_VAL implies partition max_time */ + time_t tot_sus_time; /* total time in suspend state */ + uint32_t total_procs; /* number of allocated processors, + for accounting */ + struct part_record *part_ptr; /* pointer to the partition record */ + uint32_t user_id; /* user the job runs as */ + + /* Per node allocation details */ uint16_t num_cpu_groups; /* record count in cpus_per_node and * cpu_count_reps */ uint32_t *cpus_per_node; /* array of cpus per node allocated */ uint32_t *cpu_count_reps; /* array of consecutive 
nodes with * same cpu count */ - uint32_t alloc_sid; /* local sid making resource alloc */ - char *alloc_node; /* local node making resource alloc */ - uint16_t next_step_id; /* next step id to be used */ - uint32_t node_cnt; /* count of nodes allocated to job */ - slurm_addr *node_addr; /* addresses of the nodes allocated to - * job */ - List step_list; /* list of job's steps */ - char *resp_host; /* host for srun communications */ - uint16_t alloc_resp_port; /* RESPONSE_RESOURCE_ALLOCATION port */ - uint16_t other_port; /* port for client communications */ - char *account; /* account number to charge */ - char *comment; /* arbitrary comment */ - char *network; /* network/switch requirement spec */ - struct job_record *job_next; /* next entry with same hash index */ - uint16_t cr_enabled; /* specify if if Consumable - * Resources is - * enabled. Needed since CR - * deals with a finer - * granularity in its node/cpu - * scheduling (available cpus - * instead of available nodes) - * than the bluegene and the - * linear plugins - * 0 if cr is NOT enabled, - * 1 if cr is enabled */ + uint32_t alloc_lps_cnt; /* number of hosts in alloc_lps * or 0 if alloc_lps is not needed * for the credentials */ @@ -411,16 +421,7 @@ struct job_record { * allocated for this job */ uint32_t *used_lps; /* number of logical processors * already allocated to job steps */ - char *licenses; /* licenses required by the job */ - uint16_t mail_type; /* see MAIL_JOB_* in slurm.h */ - char *mail_user; /* user to get e-mail notification */ - uint32_t requid; /* requester user ID */ - uint32_t exit_code; /* exit code for job (status from - * wait call) */ - uint16_t state_reason; /* reason job still pending or failed - * see slurm.h:enum job_wait_reason */ - uint32_t db_index; /* used only for database - plugins */ + }; /* Job dependency specification, used in "depend_list" within job_record */ -- GitLab