diff --git a/META b/META index af3fb46bef458f42f16d09aacdaf00c06ac98889..7db18821c5f3dd0a14fe4a0496e7f73821008c5e 100644 --- a/META +++ b/META @@ -11,7 +11,7 @@ Minor: 6 Micro: 0 Version: 0.6.0 - Release: 0.pre6 + Release: 0.pre8 API_CURRENT: 7 API_AGE: 4 API_REVISION: 0 diff --git a/NEWS b/NEWS index a2c53a069f5bf9cf7449ee5c3f43b4067ec39051..c81cbf648c78e50d5789d59b6bd6ab2b656a9429 100644 --- a/NEWS +++ b/NEWS @@ -10,12 +10,32 @@ documents those changes that are of interest to users and admins. -- made change to srun to start message thread before other threads to make sure localtime doesn't interfere. -* Changes in SLURM 0.6.0 -======================== +* Changes in SLURM 0.6.0-pre8 +============================= + -- Remove debugging xasserts in switch/federation that were accidentally + committed + -- Make slurmd step manager retry slurm_container_destroy() indefinitely + instead of giving up after 30 seconds. If something prevents a job + step's processes from being killed, the job will be stuck in the + completing state until the container destroy succeeds. + +* Changes in SLURM 0.6.0-pre7 +============================= + -- Disable localtime_r() calls from forked processes (semaphore set + in another pthread can deadlock calls to localtime_r made from + the forked process, this will be properly fixed in the next + major release of SLURM). -- Added SLURM_LOCALID environment variable for spawned tasks (Dan Palermo, HP). -- Modify switch logic to restore state based exclusively upon recovered job steps (not state save file). + -- Gracefully refuse job if there are too many job steps in slurmd. + -- Fix race condition in job completion that can leave nodes in + COMPLETING state after job is COMPLETED. + -- Added frees for BGL BridgeAPI strdups that were to this point unknown. + -- smap scrolls correctly for BGL systems. + -- slurm_pid2jobid() API call will now return the jobid for a step + manager slurmd process. 
* Changes in SLURM 0.6.0-pre6 ============================= diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 9f31857435705a81c61da5246c53371098a6be61..264acb9b0cdaecffe1921f0f79732373e70bcb0b 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -98,7 +98,8 @@ Print information one line per record. .TP \fIpidinfo\fP \fIPROC_ID\fP Print the Slurm job id and scheduled termination time corresponding to the -supplied process id, \fIPROC_ID\fP, on the current node. +supplied process id, \fIPROC_ID\fP, on the current node. This will only +work for processes which Slurm spawns and their descendants. .TP \fIping\fP Ping the primary and secondary slurmctld daemon and report if diff --git a/doc/man/man1/smap.1 b/doc/man/man1/smap.1 index 4f0d8ce8c65c24cfb88f38e67342f4250641bee3..e4a16c4c1f4f1dfd4b3c56bde8f9845c7b7b34d8 100644 --- a/doc/man/man1/smap.1 +++ b/doc/man/man1/smap.1 @@ -1,4 +1,4 @@ -.TH SMAP "1" "March 2005" "smap 0.6" "Slurm components" +.TH SMAP "1" "September 2005" "smap 0.6" "Slurm components" .SH "NAME" smap \- graphically view information about SLURM jobs, partitions, and set @@ -237,18 +237,6 @@ The default value is FALSE. .I \fBRotate\fR Equivalent to "Rotation=true". .TP -.I \fBMode = COPROC | VIRTUAL\fR -Specify how the second processor on a c-node should be used. -To use it for computation, specify VIRTUAL. -To use it for communications, specify COPROC. -The default value is COPROC. -.TP -.I \fBCoproc\fR -Equivalent to "Mode=Coproc". -.TP -.I \fBVirtual\fR -Equivalent to "Mode=Virtual". -.TP .I \fBElongation = TRUE | FALSE\fR If TRUE, permit the geometry specified in the size parameter to be altered as needed to fit available resources. 
diff --git a/doc/man/man3/slurm_free_job_info_msg.3 b/doc/man/man3/slurm_free_job_info_msg.3 index 5418ac166aeb8d160eaeeaaf6df69f9c4e48de7e..1f8fd1363236bb98ea212c18c4651455dbac7e29 100644 --- a/doc/man/man3/slurm_free_job_info_msg.3 +++ b/doc/man/man3/slurm_free_job_info_msg.3 @@ -154,7 +154,8 @@ time limit. record count, and array of job_table records for all jobs. .LP \fBslurm_pid2jobid\fR Returns a Slurm job id corresponding to the supplied -local process id. +local process id. This only works for processes which Slurm spawns and their +descendants. .LP \fBslurm_print_job_info\fR Prints the contents of the data structure describing a single job records from the data loaded by the diff --git a/src/common/macros.h b/src/common/macros.h index 988272604cde4935d0fbcb8a5d70cc87a94be4aa..c6d6aa1a99491bd9de403c5e2b8c1955f7858f8a 100644 --- a/src/common/macros.h +++ b/src/common/macros.h @@ -236,4 +236,15 @@ typedef enum {false, true} bool; # endif #endif +/* localtime() can't be used after a fork due to possibly set semaphore. + * until we modify slurmd and srun to exec immediately after the fork, + * we disable localtime(). This is a temporary patch. 
*/ +#define DISABLE_LOCALTIME 1 +#include "src/common/xstring.h" +#include "src/common/slurm_cred.h" +#define disable_localtime() \ + _STMT_START { \ + _xstrftimecat(NULL, NULL); \ + timestr(NULL,NULL,0); \ + } _STMT_END #endif /* !_MACROS_H */ diff --git a/src/common/parse_time.c b/src/common/parse_time.c index 3634109e845d7b06fbd28d55c784a80b7f77a39e..eef7bd343845fc70471f268ca4f424775a98bd41 100644 --- a/src/common/parse_time.c +++ b/src/common/parse_time.c @@ -210,6 +210,7 @@ static int _get_date(char *time_str, int *pos, int *month, int *mday, int *year) * now + count [minutes | hours | days | weeks] * * Invalid input results in message to stderr and return value of zero + * NOTE: not thread safe */ extern time_t parse_time(char *time_str) { diff --git a/src/common/slurm_cred.c b/src/common/slurm_cred.c index 9cca8a8246fe04d1f5db627ce99c7917d90d3b6c..92071daced55c41976f4152c753b731be00fb9d8 100644 --- a/src/common/slurm_cred.c +++ b/src/common/slurm_cred.c @@ -185,8 +185,9 @@ static void _cred_state_pack(slurm_cred_ctx_t ctx, Buf buffer); static void _job_state_pack_one(job_state_t *j, Buf buffer); static void _cred_state_pack_one(cred_state_t *s, Buf buffer); +#ifndef DISABLE_LOCALTIME static char * timestr (const time_t *tp, char *buf, size_t n); - +#endif slurm_cred_ctx_t slurm_cred_creator_ctx_create(const char *path) @@ -1103,10 +1104,19 @@ _credential_replayed(slurm_cred_ctx_t ctx, slurm_cred_t cred) return false; } +#ifdef DISABLE_LOCALTIME +extern char * timestr (const time_t *tp, char *buf, size_t n) +#else static char * timestr (const time_t *tp, char *buf, size_t n) +#endif { char fmt[] = "%y%m%d%H%M%S"; struct tm tmval; +#ifdef DISABLE_LOCALTIME + static int disabled = 0; + if (buf == NULL) disabled=1; + if (disabled) return NULL; +#endif if (!localtime_r (tp, &tmval)) error ("localtime: %m"); strftime (buf, n, fmt, &tmval); diff --git a/src/common/slurm_cred.h b/src/common/slurm_cred.h index 
fdfc913d0985c798dd0d6c28935d8295347a145f..65b96c21d19317a332f08188ba55262ddcf74b23 100644 --- a/src/common/slurm_cred.h +++ b/src/common/slurm_cred.h @@ -1,6 +1,6 @@ /*****************************************************************************\ - * src/common/slurm_cred.h - SLURM job credential operations - * $Id$ + * src/common/slurm_cred.h - SLURM job credential operations + * $Id$ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -226,5 +226,7 @@ int slurm_cred_get_signature(slurm_cred_t cred, char **datap, int *len); */ void slurm_cred_print(slurm_cred_t cred); - +#ifdef DISABLE_LOCALTIME +extern char * timestr (const time_t *tp, char *buf, size_t n); +#endif #endif /* _HAVE_SLURM_CREDS_H */ diff --git a/src/common/xstring.c b/src/common/xstring.c index e480459ff06cd68c189d4e673aeb52713b6df240..a7d1f20b4f216ccfaa31354307270e20cf07ba3b 100644 --- a/src/common/xstring.c +++ b/src/common/xstring.c @@ -149,10 +149,14 @@ void _xstrftimecat(char **buf, const char *fmt) { char p[256]; /* output truncated to 256 chars */ time_t t; - struct tm *tm_ptr = NULL; - static pthread_mutex_t localtime_lock = PTHREAD_MUTEX_INITIALIZER; + struct tm tm; const char default_fmt[] = "%m/%d/%Y %H:%M:%S %Z"; +#ifdef DISABLE_LOCALTIME + static int disabled=0; + if (!buf) disabled=1; + if (disabled) return; +#endif if (fmt == NULL) fmt = default_fmt; @@ -160,13 +164,11 @@ void _xstrftimecat(char **buf, const char *fmt) if (time(&t) == (time_t) -1) fprintf(stderr, "time() failed\n"); - pthread_mutex_lock(&localtime_lock); - if (!(tm_ptr = localtime(&t))) - fprintf(stderr, "localtime() failed\n"); + if (!localtime_r(&t, &tm)) + fprintf(stderr, "localtime_r() failed\n"); - strftime(p, sizeof(p), fmt, tm_ptr); + strftime(p, sizeof(p), fmt, &tm); - pthread_mutex_unlock(&localtime_lock); _xstrcat(buf, p); } diff --git 
a/src/partition_allocator/partition_allocator.c b/src/partition_allocator/partition_allocator.c index 38c5a524a7372722e48d9a56ae96a2504d0e3c69..766ca7818c32cf06c8dbd2eb20893666edf7e313 100644 --- a/src/partition_allocator/partition_allocator.c +++ b/src/partition_allocator/partition_allocator.c @@ -520,7 +520,6 @@ extern void print_pa_request(pa_request_t* pa_request) debug(" rotate:\t%d", pa_request->rotate); debug(" elongate:\t%d", pa_request->elongate); debug("force contig:\t%d", pa_request->force_contig); - debug(" node_use:\t%d", pa_request->node_use); } /** @@ -1810,7 +1809,12 @@ extern int set_bp_map(void) error("rm_get_data(RM_BPID): %d", rc); continue; } - + + if(!bp_id) { + error("No BP ID was returned from database"); + continue; + } + if ((rc = rm_get_data(my_bp, RM_BPLoc, &bp_loc)) != STATUS_OK) { xfree(bp_map); @@ -1833,11 +1837,11 @@ extern int set_bp_map(void) list_push(bp_map_list, bp_map); + free(bp_id); } if ((rc = rm_free_BGL(bgl)) != STATUS_OK) - error("rm_free_BGL(): %s", rc); - + error("rm_free_BGL(): %s", rc); #endif _bp_map_initialized = true; @@ -2146,6 +2150,12 @@ static int _set_external_wires(int dim, int count, pa_node_t* source, error("rm_get_data(RM_FirstWire): %d", rc); break; } + + if(!wire_id) { + error("No Wire ID was returned from database"); + continue; + } + if(wire_id[7] != '_') continue; switch(wire_id[0]) { @@ -2165,6 +2175,9 @@ static int _set_external_wires(int dim, int count, pa_node_t* source, } strncpy(from_node, wire_id+2, 4); strncpy(to_node, wire_id+8, 4); + + free(wire_id); + from_node[4] = '\0'; to_node[4] = '\0'; if ((rc = rm_get_data(my_wire, RM_WireFromPort, &my_port)) @@ -2177,7 +2190,6 @@ static int _set_external_wires(int dim, int count, pa_node_t* source, error("rm_get_data(RM_PortID): %d", rc); break; } - if ((rc = rm_get_data(my_wire, RM_WireToPort, &my_port)) != STATUS_OK) { error("rm_get_data(RM_WireToPort): %d", rc); @@ -2188,6 +2200,7 @@ static int _set_external_wires(int dim, int count, pa_node_t* 
source, error("rm_get_data(RM_PortID): %d", rc); break; } + coord = find_bp_loc(from_node); if(coord[X]>=DIM_SIZE[X] || coord[Y]>=DIM_SIZE[Y] diff --git a/src/partition_allocator/partition_allocator.h b/src/partition_allocator/partition_allocator.h index cfa7d75111a4eac3da2e6586498233bdf47f1b97..ac94e2ec151881db7334d84f9deb6430a394236d 100644 --- a/src/partition_allocator/partition_allocator.h +++ b/src/partition_allocator/partition_allocator.h @@ -121,7 +121,6 @@ typedef struct { int conn_type; int rotate_count; int elongate_count; - int node_use; bool rotate; bool elongate; bool force_contig; diff --git a/src/plugins/select/bluegene/bgl_job_run.c b/src/plugins/select/bluegene/bgl_job_run.c index 7f7572ddb2bf42948710a5091714e9ef8d37e4e3..22fc9d25828788fe500844e61f4d34cc75bf610d 100644 --- a/src/plugins/select/bluegene/bgl_job_run.c +++ b/src/plugins/select/bluegene/bgl_job_run.c @@ -396,10 +396,22 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) part_id, bgl_err_str(rc)); continue; } + + if(!part_id) { + error("No partitionID returned from Database"); + continue; + } + debug2("looking at partition %s looking for %s\n", part_id, bgl_update_ptr->bgl_part_id); - if (strcmp(part_id, bgl_update_ptr->bgl_part_id) != 0) + + if (strcmp(part_id, bgl_update_ptr->bgl_part_id) != 0) { + free(part_id); continue; + } + + free(part_id); + if ((rc = rm_get_data(job_elem, RM_JobDBJobID, &job_id)) != STATUS_OK) { error("rm_get_data(RM_JobDBJobID): %s", diff --git a/src/plugins/select/bluegene/bgl_part_info.c b/src/plugins/select/bluegene/bgl_part_info.c index 33781d606160c5c38eb0f7f87639cab6aa60a983..ec481ffe47fa1765841beae1671224488cf9b2b9 100644 --- a/src/plugins/select/bluegene/bgl_part_info.c +++ b/src/plugins/select/bluegene/bgl_part_info.c @@ -223,9 +223,14 @@ extern int update_partition_list() updated = -1; break; } - if(strncmp("RMP", name,3)) + if(!name) { + error("No Partition ID was returned from database"); continue; - + } + if(strncmp("RMP", name, 3)) { + 
free(name); + continue; + } bgl_record = find_bgl_record(name); if(bgl_record == NULL) { @@ -235,7 +240,9 @@ extern int update_partition_list() if ((rc = pm_destroy_partition(name)) != STATUS_OK) { if(rc == PARTITION_NOT_FOUND) { - debug("partition %s is not found"); + debug("partition %s is not found", + name); + free(name); break; } error("pm_destroy_partition(%s): %s", @@ -249,9 +256,11 @@ extern int update_partition_list() bgl_err_str(rc)); } else debug("done\n"); + free(name); continue; } - + free(name); + slurm_mutex_lock(&part_state_mutex); if ((rc = rm_get_data(part_ptr, RM_PartitionMode, &node_use)) diff --git a/src/plugins/select/bluegene/bgl_switch_connections.c b/src/plugins/select/bluegene/bgl_switch_connections.c index 74379b5531cb5abf725c8b85bbf9cb01a8f69d2d..c0c49c98d038d3c8e72c39133ebcd5567277bccc 100644 --- a/src/plugins/select/bluegene/bgl_switch_connections.c +++ b/src/plugins/select/bluegene/bgl_switch_connections.c @@ -1,6 +1,8 @@ /*****************************************************************************\ * bgl_switch_connections.c - Blue Gene switch management functions, * establish switch connections + * + * $Id$ ***************************************************************************** * Copyright (C) 2004 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -454,12 +456,18 @@ extern int configure_partition_switches(bgl_record_t * bgl_record) rc = SLURM_ERROR; goto cleanup; } - + + if(!bpid) { + error("No BP ID was returned from database"); + continue; + } + found_bpid = 0; for (i=0; i<switch_count; i++) { if(i) { if ((rc = rm_get_data(bgl, RM_NextSwitch, - &curr_switch)) != STATUS_OK) { + &curr_switch)) + != STATUS_OK) { fatal("rm_get_data: RM_NextSwitch: %s", bgl_err_str(rc)); list_iterator_destroy(bgl_itr); @@ -468,7 +476,8 @@ extern int configure_partition_switches(bgl_record_t * bgl_record) } } else { if ((rc = rm_get_data(bgl, RM_FirstSwitch, - &curr_switch)) != STATUS_OK) { + &curr_switch)) + != STATUS_OK) { fatal("rm_get_data: " "RM_FirstSwitch: %s", bgl_err_str(rc)); @@ -485,15 +494,25 @@ extern int configure_partition_switches(bgl_record_t * bgl_record) rc = SLURM_ERROR; goto cleanup; } - + + if(!curr_bpid) { + error("No BP ID was returned from database"); + continue; + } + if (!strcasecmp((char *)bpid, (char *)curr_bpid)) { coord_switch[found_bpid] = curr_switch; found_bpid++; - if(found_bpid==PA_SYSTEM_DIMENSIONS) + if(found_bpid==PA_SYSTEM_DIMENSIONS) { + free(curr_bpid); break; - } + } + } + free(curr_bpid); } - + + free(bpid); + if(found_bpid==PA_SYSTEM_DIMENSIONS) { debug2("adding midplane %d%d%d\n", @@ -503,17 +522,17 @@ extern int configure_partition_switches(bgl_record_t * bgl_record) switch_itr = list_iterator_create(bgl_bp->switch_list); while((bgl_switch = list_next(switch_itr)) != NULL) { - if ((rc = rm_get_data(coord_switch - [bgl_switch->dim], - RM_SwitchID,&name2)) - != STATUS_OK) { - fatal("rm_get_data: RM_SwitchID: %s", - bgl_err_str(rc)); - list_iterator_destroy(bgl_itr); - list_iterator_destroy(switch_itr); - rc = SLURM_ERROR; - goto cleanup; - } + /* if ((rc = rm_get_data(coord_switch */ +/* [bgl_switch->dim], */ +/* RM_SwitchID,&name2)) */ +/* != STATUS_OK) { */ +/* fatal("rm_get_data: RM_SwitchID: %s",*/ +/* bgl_err_str(rc)); */ +/* list_iterator_destroy(bgl_itr); */ +/* 
list_iterator_destroy(switch_itr); */ +/* rc = SLURM_ERROR; */ +/* goto cleanup; */ +/* } */ debug2("adding switch dim %d\n", bgl_switch->dim); diff --git a/src/plugins/select/bluegene/bluegene.c b/src/plugins/select/bluegene/bluegene.c index cfc245cc6317f39603f19cb5ddf093ef3af440dd..705e7e73c401c07857e9aa0a306a484de40378a7 100644 --- a/src/plugins/select/bluegene/bluegene.c +++ b/src/plugins/select/bluegene/bluegene.c @@ -110,6 +110,8 @@ extern int init_bgl(void) /* Purge all plugin variables */ extern void fini_bgl(void) { + int rc; + _set_bgl_lists(); if (bgl_list) { @@ -135,7 +137,8 @@ extern void fini_bgl(void) #ifdef HAVE_BGL_FILES if(bgl) - rm_free_BGL(bgl); + if ((rc = rm_free_BGL(bgl)) != STATUS_OK) + error("rm_free_BGL(): %s", bgl_err_str(rc)); #endif pa_fini(); } @@ -318,20 +321,26 @@ extern int remove_all_users(char *bgl_part_id, char *user_name) RM_PartitionFirstUser, &user)) != STATUS_OK) { - error("rm_get_partition(%s): %s", + error("rm_get_data(%s): %s", bgl_part_id, bgl_err_str(rc)); returnc = REMOVE_USER_ERR; break; } } - - if(!strcmp(user, slurmctld_conf.slurm_user_name)) + if(!user) { + error("No user was returned from database"); continue; - + } + if(!strcmp(user, slurmctld_conf.slurm_user_name)) { + free(user); + continue; + } + if(user_name) { if(!strcmp(user, user_name)) { returnc = REMOVE_USER_FOUND; + free(user); continue; } } @@ -344,7 +353,8 @@ extern int remove_all_users(char *bgl_part_id, char *user_name) debug("user %s isn't on partition %s", user, bgl_part_id); - } + } + free(user); } if ((rc = rm_free_partition(part_ptr)) != STATUS_OK) { error("rm_free_partition(): %s", bgl_err_str(rc)); @@ -759,9 +769,10 @@ extern int bgl_free_partition(bgl_record_t *bgl_record) debug("partition %s is not found"); break; } - error("pm_destroy_partition(%s): %s", + error("pm_destroy_partition(%s): %s " + "State = %d", bgl_record->bgl_part_id, - bgl_err_str(rc)); + bgl_err_str(rc), bgl_record->state); } } @@ -1615,7 +1626,6 @@ static int 
_update_bgl_record_state(List bgl_destroy_list) rm_partition_t *part_ptr = NULL; ListIterator itr; bgl_record_t* bgl_record = NULL; - int found = 0; if(!bgl_destroy_list) { return SLURM_SUCCESS; @@ -1665,18 +1675,19 @@ static int _update_bgl_record_state(List bgl_destroy_list) func_rc = SLURM_ERROR; break; } - found = 0; + if (!name) { + error("RM_Partition is NULL"); + continue; + } + itr = list_iterator_create(bgl_destroy_list); while ((bgl_record = (bgl_record_t*) list_next(itr))) { - if(bgl_record->bgl_part_id) - if(!strcmp(bgl_record->bgl_part_id, name)) { - found = 1; - break; - } - } - list_iterator_destroy(itr); + if(!bgl_record->bgl_part_id) + continue; + if(strcmp(bgl_record->bgl_part_id, name)) { + continue; + } - if(found) { slurm_mutex_lock(&part_state_mutex); if ((rc = rm_get_data(part_ptr, RM_PartitionState, @@ -1690,11 +1701,13 @@ static int _update_bgl_record_state(List bgl_destroy_list) name, bgl_record->state, state); bgl_record->state = state; } - slurm_mutex_unlock(&part_state_mutex); + slurm_mutex_unlock(&part_state_mutex); + break; } + list_iterator_destroy(itr); + free(name); } -clean_up: if ((rc = rm_free_partition_list(part_list)) != STATUS_OK) { error("rm_free_partition_list(): %s", bgl_err_str(rc)); } diff --git a/src/plugins/select/bluegene/partition_sys.c b/src/plugins/select/bluegene/partition_sys.c index 6aa153538ca62cfff55a8d94b6c624177e43f307..d92826568d2b1827f030205d94520370b7d6c334 100755 --- a/src/plugins/select/bluegene/partition_sys.c +++ b/src/plugins/select/bluegene/partition_sys.c @@ -167,8 +167,14 @@ static int _post_allocate(bgl_record_t *bgl_record) error("rm_get_data(RM_PartitionID): %s", bgl_err_str(rc)); bgl_record->bgl_part_id = xstrdup("UNKNOWN"); } else { + if(!part_id) { + error("No Partition ID was returned from database"); + return SLURM_ERROR; + } bgl_record->bgl_part_id = xstrdup(part_id); + free(part_id); + xfree(bgl_record->target_name); bgl_record->target_name = xstrdup(slurmctld_conf.slurm_user_name); @@ 
-268,42 +274,53 @@ int read_bgl_partitions() bgl_err_str(rc)); continue; } - if(strncmp("RMP",part_name,3)) + + if(!part_name) { + error("No Partition ID was returned from database"); continue; - + } + + if(strncmp("RMP", part_name, 3)) { + free(part_name); + continue; + } if(bgl_recover) if ((rc = rm_get_partition(part_name, &part_ptr)) != STATUS_OK) { error("Partition %s doesn't exist.", part_name); rc = SLURM_ERROR; + free(part_name); break; } - /* New BGL partition record */ bgl_record = xmalloc(sizeof(bgl_record_t)); list_push(bgl_curr_part_list, bgl_record); bgl_record->bgl_part_id = xstrdup(part_name); + + free(part_name); + bgl_record->state = -1; if ((rc = rm_get_data(part_ptr, RM_PartitionBPNum, &bp_cnt)) - != STATUS_OK) { + != STATUS_OK) { error("rm_get_data(RM_BPNum): %s", bgl_err_str(rc)); bp_cnt = 0; } if(bp_cnt==0) - continue; - + goto clean_up; + bgl_record->bgl_part_list = list_create(NULL); bgl_record->hostlist = hostlist_create(NULL); for (i=0; i<bp_cnt; i++) { if(i) { if ((rc = rm_get_data(part_ptr, - RM_PartitionNextBP, &bp_ptr)) - != STATUS_OK) { + RM_PartitionNextBP, + &bp_ptr)) + != STATUS_OK) { error("rm_get_data(RM_NextBP): %s", bgl_err_str(rc)); rc = SLURM_ERROR; @@ -317,6 +334,8 @@ int read_bgl_partitions() error("rm_get_data(RM_FirstBP): %s", bgl_err_str(rc)); rc = SLURM_ERROR; + if (bgl_recover) + rm_free_partition(part_ptr); return rc; } } @@ -327,9 +346,16 @@ int read_bgl_partitions() rc = SLURM_ERROR; break; } - + + if(!bpid) { + error("No BP ID was returned from database"); + continue; + } + coord = find_bp_loc(bpid); - + + free(bpid); + if(!coord) fatal("No contact with db2. Shutting down."); @@ -382,9 +408,19 @@ int read_bgl_partitions() xstrdup(slurmctld_conf. 
slurm_user_name); } else { - rm_get_data(part_ptr, RM_PartitionFirstUser, - &user_name); + user_name = NULL; + if ((rc = rm_get_data(part_ptr, RM_PartitionFirstUser, + &user_name)) != STATUS_OK) { + error("rm_get_data(RM_PartitionFirstUser): %s", + bgl_err_str(rc)); + } + if(!user_name) { + error("No user name was " + "returned from database"); + goto clean_up; + } bgl_record->user_name = xstrdup(user_name); + if(!bgl_record->boot_state) bgl_record->target_name = xstrdup(slurmctld_conf. @@ -392,6 +428,8 @@ int read_bgl_partitions() else bgl_record->target_name = xstrdup(user_name); + + free(user_name); } if((pw_ent = getpwnam(bgl_record->user_name)) @@ -419,8 +457,8 @@ int read_bgl_partitions() bgl_record->part_lifecycle = STATIC; - if ((rc = rm_free_partition(part_ptr)) - != STATUS_OK) { +clean_up: if (bgl_recover + && ((rc = rm_free_partition(part_ptr)) != STATUS_OK)) { error("rm_free_partition(): %s", bgl_err_str(rc)); } } diff --git a/src/plugins/select/bluegene/sfree.c b/src/plugins/select/bluegene/sfree.c index 9d3373983cde04494f68edc370f0a9b5969b8220..e1726086ab942047a65e7b25f49805879d463da0 100644 --- a/src/plugins/select/bluegene/sfree.c +++ b/src/plugins/select/bluegene/sfree.c @@ -205,11 +205,22 @@ int main(int argc, char *argv[]) break; } - if(strncmp("RMP", bgl_part_id, 3)) + + if(!bgl_part_id) { + error("No Part ID was returned from database"); continue; + } + if(strncmp("RMP", bgl_part_id, 3)) { + free(bgl_part_id); + continue; + } + delete_record = xmalloc(sizeof(delete_record_t)); delete_record->bgl_part_id = xstrdup(bgl_part_id); + + free(bgl_part_id); + delete_record->state = -1; list_push(delete_record_list, delete_record); @@ -297,7 +308,6 @@ static int _update_bgl_record_state() int j, rc, i, num_parts = 0; rm_partition_state_t state = -2; rm_partition_t *part_ptr = NULL; - int found=0; delete_record_t *delete_record = NULL; ListIterator itr; @@ -345,20 +355,21 @@ static int _update_bgl_record_state() state = -1; break; } - found = 0; + 
if(!name) { + error("No Partition ID was returned from database"); + continue; + } + itr = list_iterator_create(delete_record_list); while ((delete_record = (delete_record_t*) list_next(itr))) { - if(delete_record->bgl_part_id) - if(!strcmp(delete_record->bgl_part_id, name)) { - found = 1; - break; - } - } - list_iterator_destroy(itr); + if(!delete_record->bgl_part_id) + continue; + if(strcmp(delete_record->bgl_part_id, name)) { + continue; + } - if(found) { if(state == -1) goto clean_up; else if(j>=num_parts) { @@ -377,7 +388,10 @@ static int _update_bgl_record_state() "(RM_PartitionState): %s", _bgl_err_str(rc)); } + break; } + list_iterator_destroy(itr); + free(name); } clean_up: if ((rc = rm_free_partition_list(part_list)) != STATUS_OK) { @@ -439,9 +453,17 @@ static void _term_jobs_on_part(char *bgl_part_id) part_id, _bgl_err_str(rc)); continue; } - - if (strcmp(part_id, bgl_part_id) != 0) + + if(!part_id) { + error("No Partition ID was returned from database"); continue; + } + + if (strcmp(part_id, bgl_part_id) != 0) { + free(part_id); + continue; + } + free(part_id); job_found = 1; if ((rc = rm_get_data(job_elem, RM_JobDBJobID, &job_id)) != STATUS_OK) { @@ -450,9 +472,9 @@ static void _term_jobs_on_part(char *bgl_part_id) continue; } info("got job_id %d",job_id); - if((rc = _remove_job(job_id)) == INTERNAL_ERROR) + if((rc = _remove_job(job_id)) == INTERNAL_ERROR) { goto not_removed; - + } } if(job_found == 0) info("No jobs on bglblock %s", bgl_part_id); diff --git a/src/plugins/select/bluegene/state_test.c b/src/plugins/select/bluegene/state_test.c index 807cc8f0000dfa210a553fc6f1bb8194517e3ae4..c49284c2202a52c5cd40836a837749dc396e7587 100644 --- a/src/plugins/select/bluegene/state_test.c +++ b/src/plugins/select/bluegene/state_test.c @@ -101,8 +101,18 @@ static void _configure_node_down(rm_bp_id_t bp_id, rm_BGL_t *bgl) error("rm_get_data(RM_BPID): %s", bgl_err_str(rc)); continue; } - if (strcmp(bp_id, bpid) != 0) /* different base partition */ + + if(!bpid) 
{ + error("No BPID was returned from database"); continue; + } + + if (strcmp(bp_id, bpid) != 0) { /* different base partition */ + free(bpid); + continue; + } + free(bpid); + if ((rc = rm_get_data(my_bp, RM_BPState, &bp_state)) != STATUS_OK) { error("rm_get_data(RM_BPState): %s", bgl_err_str(rc)); @@ -277,7 +287,14 @@ static void _test_down_switches(rm_BGL_t *bgl) bgl_err_str(rc)); continue; } + + if(!bp_id) { + error("No BPID was returned from database"); + continue; + } + _configure_node_down(bp_id, bgl); + free(bp_id); } } #endif diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c index 206831578cf6b864b7d1fb97e1ec282027f9c3f2..3e4e831516f210b49b0a019741e51f0071d17175 100644 --- a/src/plugins/switch/federation/federation.c +++ b/src/plugins/switch/federation/federation.c @@ -1522,10 +1522,6 @@ _job_step_window_state(fed_jobinfo_t *jp, hostlist_t hl, enum NTBL_RC state) || (hostlist_is_empty(hl))) return SLURM_ERROR; - xassert(jp->tables_per_task); - xassert(jp->tableinfo); - xassert(jp->tableinfo[0].table_length); - if ((jp->tables_per_task == 0) || !jp->tableinfo || (jp->tableinfo[0].table_length == 0)) diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 3d74fc4f3edccfe7453e40d9c99e4e641fee98b1..8c5f7ff11a1d8bd014d553912e25b94276f9f30e 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1576,6 +1576,9 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, return ESLURM_USER_ID_MISSING; } + if (job_ptr->job_state & JOB_COMPLETING) + return SLURM_SUCCESS; /* avoid replay */ + if (job_ptr->job_state == JOB_RUNNING) job_comp_flag = JOB_COMPLETING; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 8184e9cc41fd0e1c96c44097ed332663c22d6cd8..7e9a1e7d733931cf9e5f624f8e7a70a0e00547db 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1356,7 +1356,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count, 
hostlist_uniq(reg_hostlist); hostlist_ranged_string(reg_hostlist, sizeof(host_str), host_str); - debug("Nodes %s have registerd", host_str); + debug("Nodes %s have registered", host_str); hostlist_destroy(reg_hostlist); } if (return_hostlist) { diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 241f4260bb38eaf661b3d3ab337e7547dad18cd1..b36a561061982a88452758a1d89cde9134d1c8e2 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -866,7 +866,7 @@ _kill_running_tasks(slurmd_job_t *job) List steps; ListIterator i; job_step_t *s = NULL; - int limit = 0; + int delay = 1; if (job->batch) return; @@ -884,12 +884,16 @@ _kill_running_tasks(slurmd_job_t *job) if (s->cont_id) { slurm_container_signal(s->cont_id, SIGKILL); - /* Try destroying the container up to 30 times */ - while (slurm_container_destroy(s->cont_id) != SLURM_SUCCESS - && limit < 30) { + /* Spin until the container is successfully destroyed */ + while (slurm_container_destroy(s->cont_id) != SLURM_SUCCESS) { slurm_container_signal(s->cont_id, SIGKILL); - sleep(1); - limit++; + sleep(delay); + if (delay < 120) { + delay *= 2; + } else { + error("Unable to destroy container, job %u.%u", + job->jobid, job->stepid); + } } } diff --git a/src/slurmd/proctrack.c b/src/slurmd/proctrack.c index d07aaafcd075b69ce78fce9ace60b38cad78521b..bc2efea7cd89373ada1f8676a4238b614830130d 100644 --- a/src/slurmd/proctrack.c +++ b/src/slurmd/proctrack.c @@ -188,7 +188,8 @@ slurm_proctrack_init( void ) } if ( _proctrack_get_ops( g_proctrack_context ) == NULL ) { - error( "cannot resolve proctrack plugin operations" ); + error( "cannot resolve proctrack plugin operations for %s", + proctrack_type ); _proctrack_context_destroy( g_proctrack_context ); g_proctrack_context = NULL; retval = SLURM_ERROR; diff --git a/src/slurmd/req.c b/src/slurmd/req.c index 4e5b0f4207228fd4af88c7acb6b6d36d99a2d4e8..160126112b51e584d9bf9274b437ac31130011cf 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -244,6 +244,9 @@ _fork_new_slurmd(void) 
return ((int) pid); } +#ifdef DISABLE_LOCALTIME + disable_localtime(); +#endif if (close(fds[0]) < 0) error("Unable to close read-pipe in child: %m"); @@ -456,6 +459,14 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) goto done; } + /* Make an effort to not overflow shm records */ + if (shm_free_steps() < 2) { + errnum = ESLURMD_TOOMANYSTEPS; + error("reject task %u.%u, too many steps", req->job_id, + req->job_step_id); + goto done; + } + /* xassert(slurm_cred_jobid_cached(conf->vctx, req->job_id));*/ /* Run job prolog if necessary */ @@ -517,6 +528,14 @@ _rpc_spawn_task(slurm_msg_t *msg, slurm_addr *cli) goto done; } + /* Make an effort to not overflow shm records */ + if (shm_free_steps() < 2) { + errnum = ESLURMD_TOOMANYSTEPS; + error("reject task %u.%u, too many steps", req->job_id, + req->job_step_id); + goto done; + } + slurmd_get_addr(cli, &port, host, sizeof(host)); info("spawn task %u.%u request from %u@%s", req->job_id, req->job_step_id, req->uid, host); @@ -614,6 +633,14 @@ _rpc_batch_job(slurm_msg_t *msg, slurm_addr *cli) goto done; } + /* Make an effort to not overflow shm records */ + if (shm_free_steps() < 2) { + rc = ESLURMD_TOOMANYSTEPS; + error("reject job %u, too many steps", req->job_id); + _prolog_error(req, rc); + goto done; + } + if (req->step_id != NO_VAL && req->step_id != 0) first_job_run = false; @@ -875,15 +902,27 @@ static void _rpc_pid2jid(slurm_msg_t *msg, slurm_addr *cli) job_id_response_msg_t resp; bool found = false; uint32_t my_cont = slurm_container_find(req->job_pid); + List steps = shm_get_steps(); + ListIterator i = list_iterator_create(steps); + job_step_t *s = NULL; if (my_cont == 0) { - verbose("slurm_container_find(%u): process not found", - (uint32_t) req->job_pid); + debug("slurm_container_find(%u): process not found", + (uint32_t) req->job_pid); + /* + * Check if the job_pid matches the pid of a job step slurmd. 
+ * LCRM gets confused if a session leader process + * (the job step slurmd) is not labelled as a process in the + * job step. + */ + while ((s = list_next(i))) { + if (s->mpid == req->job_pid) { + resp.job_id = s->jobid; + found = true; + break; + } + } } else { - List steps = shm_get_steps(); - ListIterator i = list_iterator_create(steps); - job_step_t *s = NULL; - while ((s = list_next(i))) { if (s->cont_id == my_cont) { resp.job_id = s->jobid; @@ -891,9 +930,9 @@ static void _rpc_pid2jid(slurm_msg_t *msg, slurm_addr *cli) break; } } - list_iterator_destroy(i); - list_destroy(steps); } + list_iterator_destroy(i); + list_destroy(steps); if (found) { resp_msg.address = msg->address; diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 7d7c7c47c0d9c9f24ce3b5b8a86965dfe93d60d5..ec807bba79bed2015a75ef4b61623e50e21ecf4e 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -1,6 +1,6 @@ /*****************************************************************************\ - * src/slurmd/shm.c - slurmd shared memory routines - * $Id$ + * src/slurmd/shm.c - slurmd shared memory routines + * $Id$ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -269,6 +269,20 @@ shm_get_steps(void) return l; } +extern int +shm_free_steps(void) +{ + int i, rc = 0; + + xassert(slurmd_shm != NULL); + _shm_lock(); + for (i = 0; i < MAX_JOB_STEPS; i++) { + if (slurmd_shm->step[i].state == SLURMD_JOB_UNUSED) + rc++; + } + _shm_unlock(); + return rc; +} static bool _job_step_mgr_still_running(job_step_t *step) diff --git a/src/slurmd/shm.h b/src/slurmd/shm.h index ab7c4f581111184456485f647059ddb200325def..369f2568ffe72a5f6a45a23a10e11a9a279c2468 100644 --- a/src/slurmd/shm.h +++ b/src/slurmd/shm.h @@ -1,6 +1,6 @@ /*****************************************************************************\ - * src/slurmd/shm.h - shared memory routines for slurmd - * $Id$ + * src/slurmd/shm.h - shared memory routines for slurmd + * $Id$ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -263,6 +263,10 @@ int shm_update_job_timelimit(uint32_t jobid, time_t newlim); */ int shm_update_step_timelimit(uint32_t jobid, uint32_t stepid, time_t newlim); +/* + * Return count of free shm records + */ +extern int shm_free_steps(void); /* * Return job step timelimit diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index b73bf5b31130f603f2fbfd99d652a94495c9907f..4633f16c43590a8e24e2603ebc49181c220c6ef7 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -657,7 +657,8 @@ _slurmd_init() */ _update_logging(); _print_conf(); - slurm_proctrack_init(); + if (slurm_proctrack_init() != SLURM_SUCCESS) + return SLURM_FAILURE; if (getrlimit(RLIMIT_NOFILE,&rlim) == 0) { rlim.rlim_cur = rlim.rlim_max; diff --git a/src/smap/configure_functions.c b/src/smap/configure_functions.c index 533d6a7904121531eaad30c6c7431ad127e331cb..ee95c3fdd7c16b71d46bac013bb187db1725fba7 100644 --- a/src/smap/configure_functions.c +++ b/src/smap/configure_functions.c @@ -108,7 +108,6 @@ static int 
_create_allocation(char *com, List allocated_partitions) request->rotate = false; request->elongate = false; request->force_contig = false; - request->node_use = -1; while(i<len) { @@ -521,7 +520,6 @@ static int _copy_allocation(char *com, List allocated_partitions) request->geometry[Z] = allocated_part->request->geometry[Z]; request->size = allocated_part->request->size; request->conn_type=allocated_part->request->conn_type; - request->node_use=allocated_part->request->node_use; request->rotate =allocated_part->request->rotate; request->elongate = allocated_part->request->elongate; request->force_contig = allocated_part->request->force_contig; @@ -615,24 +613,6 @@ static int _save_allocation(char *com, List allocated_partitions) else conn_type = "MESH"; - if(allocated_part->request->node_use != -1) { - if(allocated_part->request->node_use - == COPROCESSOR) - mode_type = "COPROCESSOR"; - else - mode_type = "VIRTUAL"; - - sprintf(save_string, "Nodes=%s Type=%s " - "Use=%s\n", - allocated_part->request->save_name, - conn_type, mode_type); - } else { - sprintf(save_string, "Nodes=%s " - "Type=%s\n", - allocated_part->request->save_name, - conn_type); - } - fputs (save_string,file_ptr); } fclose (file_ptr); @@ -646,15 +626,9 @@ static void _print_header_command(void) mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "ID"); pa_system_ptr->xcord += 4; - /* mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, */ -/* pa_system_ptr->xcord, "PARTITION"); */ -/* pa_system_ptr->xcord += 10; */ mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "TYPE"); pa_system_ptr->xcord += 7; - mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, - pa_system_ptr->xcord, "CONF"); - pa_system_ptr->xcord += 9; mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "CONTIG"); pa_system_ptr->xcord += 7; @@ -681,9 +655,6 @@ static void _print_text_command(allocated_part_t *allocated_part) 
mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "%c",allocated_part->letter); pa_system_ptr->xcord += 4; - /* mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, */ - /* pa_system_ptr->xcord, "PARTITION"); */ - /* pa_system_ptr->xcord += 10; */ if(allocated_part->request->conn_type==TORUS) mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "TORUS"); @@ -692,22 +663,6 @@ static void _print_text_command(allocated_part_t *allocated_part) pa_system_ptr->xcord, "MESH"); pa_system_ptr->xcord += 7; - if(allocated_part->request->node_use != -1) { - if(allocated_part->request->node_use == COPROCESSOR) - mvwprintw(pa_system_ptr->text_win, - pa_system_ptr->ycord, - pa_system_ptr->xcord, "coproc"); - else - mvwprintw(pa_system_ptr->text_win, - pa_system_ptr->ycord, - pa_system_ptr->xcord, "virtual"); - } else { - mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, - pa_system_ptr->xcord, "both"); - } - - pa_system_ptr->xcord += 9; - if(allocated_part->request->force_contig) mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "Y"); @@ -748,12 +703,14 @@ static void _print_text_command(allocated_part_t *allocated_part) void get_command(void) { char com[255]; - //static node_info_msg_t *node_info_ptr; + int text_width, text_startx; allocated_part_t *allocated_part = NULL; int i=0; + int count=0; + WINDOW *command_win; - List allocated_partitions; + List allocated_partitions; ListIterator results_i; if(params.commandline) { @@ -796,22 +753,29 @@ void get_command(void) pa_system_ptr->xcord++; } pa_system_ptr->ycord++; - pa_system_ptr->xcord=1; - memset(error_string,0,255); - + pa_system_ptr->xcord=1; + memset(error_string,0,255); } results_i = list_iterator_create(allocated_partitions); + + count = list_count(allocated_partitions) + - (LINES-(pa_system_ptr->ycord+5)); + + if(count<0) + count=0; + i=0; while((allocated_part = list_next(results_i)) != NULL) { - 
_print_text_command(allocated_part); + if(i>=count) + _print_text_command(allocated_part); + i++; } - list_iterator_destroy(results_i); - + list_iterator_destroy(results_i); wnoutrefresh(pa_system_ptr->text_win); wnoutrefresh(pa_system_ptr->grid_win); doupdate(); clear_window(command_win); - //wclear(command_win); + box(command_win, 0, 0); mvwprintw(command_win, 0, 3, "Input Command: (type quit to change view, " @@ -824,6 +788,8 @@ void get_command(void) _delete_allocated_parts(allocated_partitions); pa_fini(); exit(0); + } if (!strcmp(com, "quit")) { + break; } else if (!strncasecmp(com, "resolve", 7) || !strncasecmp(com, "r ", 2)) { _resolve(com); diff --git a/src/smap/partition_functions.c b/src/smap/partition_functions.c index 3dbd3f06303fd7e8f38decef844b3baf7707687b..35f4a2a70f471b10a8517d8a12876ade0574331b 100644 --- a/src/smap/partition_functions.c +++ b/src/smap/partition_functions.c @@ -69,7 +69,7 @@ static int _list_match_all(void *object, void *key); static int _in_slurm_partition(db2_block_info_t *db2_info_ptr, int *first, int *last); -static int _print_rest(db2_block_info_t *block_ptr, int *count); +static int _print_rest(db2_block_info_t *block_ptr); #endif extern void get_slurm_part() @@ -316,12 +316,12 @@ extern void get_bgl_part() list_next(itr)) != NULL) { if (params.commandline) block_ptr->printed = 1; - else + else { if(count>=text_line_cnt) block_ptr->printed = 1; - - _print_rest(block_ptr, &count); - + } + _print_rest(block_ptr); + count++; } list_iterator_destroy(itr); @@ -798,63 +798,13 @@ static int _in_slurm_partition(db2_block_info_t *db2_info_ptr, } -static int _print_rest(db2_block_info_t *block_ptr, int *count) +static int _print_rest(db2_block_info_t *block_ptr) { partition_info_t part; db2_block_info_t *db2_info_ptr = NULL; ListIterator itr; int set = 0; -/* part.total_nodes = 0; */ - -/* if (block_list) { */ -/* itr = list_iterator_create(block_list); */ -/* while ((db2_info_ptr = (db2_block_info_t*) list_next(itr)) */ -/* != NULL) 
{ */ -/* if(!strcmp(block_ptr->bgl_block_name, */ -/* db2_info_ptr->bgl_block_name)) { */ -/* if(set == 2) */ -/* break; */ -/* set = 0; */ -/* break; */ -/* } */ -/* if((block_ptr->start[X]==db2_info_ptr->start[X] && */ -/* block_ptr->start[Y]==db2_info_ptr->start[Y] && */ -/* block_ptr->start[Z]==db2_info_ptr->start[Z]) && */ -/* (block_ptr->end[X]==db2_info_ptr->end[X] && */ -/* block_ptr->end[Y]==db2_info_ptr->end[Y] && */ -/* block_ptr->end[Z]==db2_info_ptr->end[Z])) { */ -/* set = 1; */ -/* break; */ -/* } */ - -/* if((block_ptr->start[X]<=db2_info_ptr->start[X] && */ -/* block_ptr->start[Y]<=db2_info_ptr->start[Y] && */ -/* block_ptr->start[Z]<=db2_info_ptr->start[Z]) && */ -/* (block_ptr->end[X]>=db2_info_ptr->end[X] && */ -/* block_ptr->end[Y]>=db2_info_ptr->end[Y] && */ -/* block_ptr->end[Z]>=db2_info_ptr->end[Z])) { */ -/* set = 2; */ -/* continue; */ -/* } */ -/* } */ -/* list_iterator_destroy(itr); */ -/* } */ - -/* if (set == 1) { */ -/* block_ptr->letter_num=db2_info_ptr->letter_num; */ -/* part.total_nodes += set_grid_bgl(block_ptr->start, */ -/* block_ptr->end, */ -/* block_ptr->letter_num, */ -/* set); */ -/* } else { */ -/* block_ptr->letter_num=*count; */ -/* part.total_nodes += set_grid_bgl(block_ptr->start, */ -/* block_ptr->end, */ -/* block_ptr->letter_num, */ -/* set); */ -/* (*count)++; */ -/* } */ part.total_nodes = block_ptr->size; if(block_ptr->slurm_part_name) part.name = block_ptr->slurm_part_name; diff --git a/src/smap/smap.c b/src/smap/smap.c index 062ae99a2f641c7cc9614ff3e0d11279f42000ba..adf929d6aefc17800d85667906395263194ef460 100644 --- a/src/smap/smap.c +++ b/src/smap/smap.c @@ -101,18 +101,24 @@ int main(int argc, char *argv[]) if(params.partition[0] == 'r') params.partition[0] = 'R'; if(params.partition[0] != 'R') { - char *rack_mid = find_bp_rack_mid(params.partition); + i = strlen(params.partition); + i -= 3; + if(i<0) { + printf("No real block was entered\n"); + goto part_fini; + } + char *rack_mid = 
find_bp_rack_mid(params.partition+i); if(rack_mid) printf("X=%c Y=%c Z=%c resolves to %s\n", - params.partition[X], - params.partition[Y], - params.partition[Z], + params.partition[X+i], + params.partition[Y+i], + params.partition[Z+i], rack_mid); else printf("X=%c Y=%c Z=%c has no resolve\n", - params.partition[X], - params.partition[Y], - params.partition[Z]); + params.partition[X+i], + params.partition[Y+i], + params.partition[Z+i]); } else { int *coord = find_bp_loc(params.partition); diff --git a/src/srun/msg.c b/src/srun/msg.c index 6fa5bdaa6305c73bacddd264dd18e21f5a561b8f..c0f6cab7ef0a45d9e5f0ca5ed7a1935132b9e2be 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -998,7 +998,10 @@ msg_thr_create(srun_job_t *job) if((job->forked_msg->par_msg->pid = fork()) == -1) return SLURM_ERROR; // there was an error else if (job->forked_msg->par_msg->pid == 0) - { // child: + { // child: +#ifdef DISABLE_LOCALTIME + disable_localtime(); +#endif setsid(); message_thread = 1; close(job->forked_msg->