From c575466317f09bc083587089d1044585f168c5a2 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Mon, 8 Jun 2009 19:24:52 +0000
Subject: [PATCH] svn merge -r17738:17749
 https://eris.llnl.gov/svn/slurm/branches/slurm-2.0

---
 NEWS                                          |  6 +++++
 doc/html/power_save.shtml                     |  4 ----
 doc/man/man5/slurm.conf.5                     |  4 ----
 src/common/slurm_protocol_api.c               | 24 ++++++++++++++-----
 .../mysql/mysql_jobacct_process.c             | 18 +++++++-------
 src/slurmctld/power_save.c                    | 15 ++++++++----
 6 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/NEWS b/NEWS
index 4e60fb0257a..d7b27ed8aba 100644
--- a/NEWS
+++ b/NEWS
@@ -46,6 +46,12 @@ documents those changes that are of interest to users and admins.
  -- Fix node weight (scheduling priority) calculation for powered down
     nodes. Patch from Hongjia Cao, NUDT.
  -- Fix node suspend/resume rate calculations. Patch from Hongjia Cao, NUDT.
+ -- Change calculations using ResumeRate and SuspendRate to provide higher
+    resolution.
+ -- Log the IP address for incoming messages having an invalid protocol
+    version number.
+ -- Fix for sacct to show jobs that start the same second as the sacct
+    command is issued.
 
 * Changes in SLURM 2.0.1
 ========================
diff --git a/doc/html/power_save.shtml b/doc/html/power_save.shtml
index 0254e701892..763584b42d0 100644
--- a/doc/html/power_save.shtml
+++ b/doc/html/power_save.shtml
@@ -43,8 +43,6 @@ Maximum number of nodes to be placed into power saving mode
 per minute. 
 A value of zero results in no limits being imposed.
 The default value is 60.
-Rate calculations are performed using integers, so rounding errors
-may be significant for very small values (1 or 2 nodes per minute).
 Use this to prevent rapid drops in power requirements.</li>
 
 <li><b>ResumeRate</b>:
@@ -52,8 +50,6 @@ Maximum number of nodes to be removed from power saving mode
 per minute. 
 A value of zero results in no limits being imposed.
 The default value is 300.
-Rate calculations are performed using integers, so rounding errors
-may be significant for very small values (1 or 2 nodes per minute).
 Use this to prevent rapid increases in power requirements.</li>
 
 <li><b>SuspendProgram</b>:
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index e1d931226ef..d82014881cf 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1020,8 +1020,6 @@ power surges if a large number of nodes in power save mode are
 assigned work at the same time (e.g. a large job starts).
 A value of zero results in no limits being imposed. 
 The default value is 300 nodes per minute.
-Rate calculations are performed using integers, so rounding errors
-may be significant for very small values (1 or 2 nodes per minute).
 Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, 
 \fBSuspendRate\fR, \fBSuspendTime\fR, \fBResumeTimeout\fR, \fBSuspendProgram\fR, 
 \fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR.
@@ -1443,8 +1441,6 @@ The value is number of nodes per minute and it can be used to prevent
 a large drop in power power consumption (e.g. after a large job completes).
 A value of zero results in no limits being imposed.
 The default value is 60 nodes per minute.
-Rate calculations are performed using integers, so rounding errors
-may be significant for very small values (1 or 2 nodes per minute).
 Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, 
 \fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendTime\fR, \fBSuspendTimeout\fR, 
 \fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR.
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index fb8c0ee9387..1ab6214c1b8 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -1905,9 +1905,13 @@ int slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout)
 	}
 	
 	if (check_header_version(&header) < 0) {
+		slurm_addr resp_addr;
+		char addr_str[32];
 		int uid = _unpack_msg_uid(buffer);
-		error("Invalid Protocol Version %u from uid=%d", 
-			header.version, uid);
+		slurm_get_peer_addr(fd, &resp_addr);
+		slurm_print_slurm_addr(&resp_addr, addr_str, sizeof(addr_str));
+		error("Invalid Protocol Version %u from uid=%d at %s", 
+			header.version, uid, addr_str);
 		free_buf(buffer);
 		rc = SLURM_PROTOCOL_VERSION_ERROR;
 		goto total_return;
@@ -2067,9 +2071,13 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout)
 	}
 	
 	if(check_header_version(&header) < 0) {
+		slurm_addr resp_addr;
+		char addr_str[32];
 		int uid = _unpack_msg_uid(buffer);
-		error("Invalid Protocol Version %u from uid=%d",
-			header.version, uid);
+		slurm_get_peer_addr(fd, &resp_addr);
+		slurm_print_slurm_addr(&resp_addr, addr_str, sizeof(addr_str));
+		error("Invalid Protocol Version %u from uid=%d at %s", 
+			header.version, uid, addr_str);
 		free_buf(buffer);
 		rc = SLURM_PROTOCOL_VERSION_ERROR;
 		goto total_return;
@@ -2250,9 +2258,13 @@ int slurm_receive_msg_and_forward(slurm_fd fd, slurm_addr *orig_addr,
 	}
 	
 	if (check_header_version(&header) < 0) {
+		slurm_addr resp_addr;
+		char addr_str[32];
 		int uid = _unpack_msg_uid(buffer);
-		error("Invalid Protocol Version %u from uid=%d", 
-			header.version, uid);
+		slurm_get_peer_addr(fd, &resp_addr);
+		slurm_print_slurm_addr(&resp_addr, addr_str, sizeof(addr_str));
+		error("Invalid Protocol Version %u from uid=%d at %s", 
+			header.version, uid, addr_str);
 		free_buf(buffer);
 		rc = SLURM_PROTOCOL_VERSION_ERROR;
 		goto total_return;
diff --git a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c
index 663ef55da69..938c163ee22 100644
--- a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c
+++ b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c
@@ -470,7 +470,6 @@ extern int setup_job_cond_limits(mysql_conn_t *mysql_conn,
 	char *object = NULL;
 	char *table_level = "t2";
 	jobacct_selected_step_t *selected_step = NULL;
-	time_t now = time(NULL);
 
 	if(!job_cond)
 		return 0;
@@ -651,17 +650,20 @@ extern int setup_job_cond_limits(mysql_conn_t *mysql_conn,
 	}
 
 	if(job_cond->usage_start) {
-		if(!job_cond->usage_end)
-			job_cond->usage_end = now;
-
 		if(*extra)
 			xstrcat(*extra, " && (");
 		else
 			xstrcat(*extra, " where (");
-		xstrfmtcat(*extra, 
-			   "(t1.eligible < %d "
-			   "&& (t1.end >= %d || t1.end = 0)))",
-			   job_cond->usage_end, job_cond->usage_start);
+
+		if(!job_cond->usage_end)
+			xstrfmtcat(*extra, 
+				   "t1.end >= %d || t1.end = 0)",
+				   job_cond->usage_start);
+		else
+			xstrfmtcat(*extra, 
+				   "(t1.eligible < %d "
+				   "&& (t1.end >= %d || t1.end = 0)))",
+				   job_cond->usage_end, job_cond->usage_start);
 	} else if(job_cond->usage_end) {
 		if(*extra)
 			xstrcat(*extra, " && (");
diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c
index d16a5719982..9e2ae879a81 100644
--- a/src/slurmctld/power_save.c
+++ b/src/slurmctld/power_save.c
@@ -83,7 +83,8 @@ time_t last_config = (time_t) 0, last_suspend = (time_t) 0;
 uint16_t slurmd_timeout;
 
 bitstr_t *exc_node_bitmap = NULL, *suspend_node_bitmap = NULL;
-int suspend_cnt, resume_cnt;
+int   suspend_cnt,   resume_cnt;
+float suspend_cnt_f, resume_cnt_f;
 
 static void  _clear_power_config(void);
 static void  _do_power_work(void);
@@ -111,13 +112,15 @@ static void _do_power_work(void)
 	/* Set limit on counts of nodes to have state changed */
 	delta_t = now - last_work_scan;
 	if (delta_t >= 60) {
-		suspend_cnt = 0;
-		resume_cnt  = 0;
+		suspend_cnt_f = 0.0;
+		resume_cnt_f  = 0.0;
 	} else {
 		float rate = (60 - delta_t) / 60.0;
-		suspend_cnt *= rate;
-		resume_cnt  *= rate;
+		suspend_cnt_f *= rate;
+		resume_cnt_f  *= rate;
 	}
+	suspend_cnt = (suspend_cnt_f + 0.5);
+	resume_cnt  = (resume_cnt_f  + 0.5);
 
 	if (now > (last_suspend + suspend_timeout)) {
 		/* ready to start another round of node suspends */
@@ -151,6 +154,7 @@ static void _do_power_work(void)
 			}
 			wake_cnt++;
 			resume_cnt++;
+			resume_cnt_f++;
 			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
 			bit_clear(power_node_bitmap, i);
 			node_ptr->node_state   |= NODE_STATE_NO_RESPOND;
@@ -173,6 +177,7 @@ static void _do_power_work(void)
 			}
 			sleep_cnt++;
 			suspend_cnt++;
+			suspend_cnt_f++;
 			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
 			bit_set(power_node_bitmap, i);
 			bit_set(sleep_node_bitmap,   i);
-- 
GitLab