diff --git a/NEWS b/NEWS index db0de12db25b04d18fe85cbc1c42fff1d12a4648..608c733050eb352339eaf61cf754274837deac0e 100644 --- a/NEWS +++ b/NEWS @@ -261,6 +261,13 @@ documents those changes that are of interest to users and admins. resources released by the preempted jobs. -- Fix SelectTypeParameters=CR_PACK_NODES for srun making both job and step resource allocation. + -- BGQ - Make it possible to pack multiple tasks on a core when not using + the entire cnode. + -- MYSQL - if unable to connect to mysqld, close the initialized connection. + -- DBD - when connecting make sure we wait MessageTimeout + 35 seconds (30 for + the MySQL connect timeout plus 5 of slack); otherwise both sides use the + same timeout and a race condition could occur in the requesting client + when receiving the response if the database is unresponsive. * Changes in Slurm 14.03.6 ========================== diff --git a/src/common/proc_args.c b/src/common/proc_args.c index c9f941078eab4cb36e88bb2e59aa802c6f7ae27a..67330ff4d8257124db9bc6d5b60bee88e7d4a4e3 100644 --- a/src/common/proc_args.c +++ b/src/common/proc_args.c @@ -1213,7 +1213,7 @@ extern void bg_figure_nodes_tasks(int *min_nodes, int *max_nodes, "for you.", *ntasks_per_node, node_cnt, ntpn); *ntasks_per_node = ntpn; - } else if ((node_cnt * ntpn) > *ntasks) { + } else if (!overcommit && ((node_cnt * ntpn) > *ntasks)) { ntpn = (*ntasks + node_cnt - 1) / node_cnt; while (!_check_is_pow_of_2(ntpn)) ntpn++; diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index 826e202ae73c3a9cb93015a14fce562d6d586b70..82076103adc3dc57acc4c99ac27d48635a3749e8 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -1564,7 +1564,14 @@ static int _send_init_msg() return rc; } - read_timeout = slurm_get_msg_timeout() * 1000; + /* Add 35 seconds here to make sure the DBD has enough time to + process the request. 
30 seconds is defined in + src/database/mysql_common.c in mysql_db_get_db_connection + as the time to wait for a MySQL connection, and the extra 5 seconds + avoid a race condition, since both ends could otherwise + time out at the same moment and leave no time to send the response back. + */ + read_timeout = (slurm_get_msg_timeout() + 35) * 1000; rc = _get_return_code(SLURM_PROTOCOL_VERSION, read_timeout); if (tmp_errno) errno = tmp_errno; @@ -2124,12 +2131,12 @@ static void *_agent(void *x) break; } list_iterator_destroy(agent_itr); - buffer = pack_slurmdbd_msg(&list_req, - SLURM_PROTOCOL_VERSION); + buffer = pack_slurmdbd_msg( + &list_req, SLURM_PROTOCOL_VERSION); } else if (cnt > 1) { list_msg.my_list = agent_list; - buffer = pack_slurmdbd_msg(&list_req, - SLURM_PROTOCOL_VERSION); + buffer = pack_slurmdbd_msg( + &list_req, SLURM_PROTOCOL_VERSION); } else buffer = (Buf) list_peek(agent_list); } else @@ -2160,7 +2167,8 @@ static void *_agent(void *x) rc = _handle_mult_rc_ret(SLURM_PROTOCOL_VERSION, read_timeout); } else { - rc = _get_return_code(SLURM_PROTOCOL_VERSION, read_timeout); + rc = _get_return_code(SLURM_PROTOCOL_VERSION, + read_timeout); if (rc == EAGAIN) { if (agent_shutdown) { slurm_mutex_unlock(&slurmdbd_lock); diff --git a/src/database/mysql_common.c b/src/database/mysql_common.c index fd1c84b58a03ab436041704f31887dbfeac15ca1..4cb13b8ca464110dcfcc049b6f9f62f61e1d5d0a 100644 --- a/src/database/mysql_common.c +++ b/src/database/mysql_common.c @@ -664,6 +664,11 @@ extern int mysql_db_get_db_connection(mysql_conn_t *mysql_conn, char *db_name, fatal("mysql_init failed: %s", mysql_error(mysql_conn->db_conn)); } else { + /* If this timeout ever changes you will need to alter + * src/common/slurmdbd_defs.c function _send_init_msg to + * handle a different timeout when polling for the + * response. 
+ */ unsigned int my_timeout = 30; #ifdef MYSQL_OPT_RECONNECT my_bool reconnect = 1; @@ -696,6 +701,8 @@ extern int mysql_db_get_db_connection(mysql_conn_t *mysql_conn, char *db_name, } rc = ESLURM_DB_CONNECTION; + mysql_close(mysql_conn->db_conn); + mysql_conn->db_conn = NULL; break; } } else {