tud-zih-energy / Slurm / Commits / f2b890a3

Commit f2b890a3, authored 17 years ago by Moe Jette

    major restructuring of wiki will_run command logic.

parent dc73dca7
Showing 2 changed files with 118 additions and 114 deletions:

  src/plugins/sched/wiki2/job_will_run.c  (+116, -113)
  src/plugins/sched/wiki2/msg.c           (+2, -1)
src/plugins/sched/wiki2/job_will_run.c  (+116, -113)
@@ -41,17 +41,30 @@
 #include "src/slurmctld/node_scheduler.h"
 #include "src/slurmctld/state_save.h"
 
-static char *	_copy_nodelist_no_dup(char *node_list);
-static int	_will_run_test(uint32_t jobid, char *hostlist,
-			int *err_code, char **err_msg);
+static char *	_will_run_test(uint32_t jobid, char *job_list,
+			char *exclude_list, int *err_code, char **err_msg);
 
-/* RET 0 on success, -1 on failure */
+/*
+ * get_jobs - get information on specific job(s) changed since some time
+ * cmd_ptr IN - CMD=JOBWILLRUN ARG=<JOBID> AFTER=<JOBID>[:<JOBID>...]
+ *	[EXCLUDE=<node_list>]
+ * err_code OUT - 0 on success or some error code
+ * err_msg OUT - error message or the JOBID from ordered list after
+ *	which the specified job can start (no JOBID if job
+ *	can start immediately) and the assigned node list.
+ *	ARG=<JOBID> [AFTER=<JOBID>] NODES=<node_list>
+ * NOTE: xfree() err_msg if err_code is zero
+ * RET 0 on success, -1 on failure
+ */
 extern int	job_will_run(char *cmd_ptr, int *err_code, char **err_msg)
 {
-	char *arg_ptr, *task_ptr, *node_ptr, *tmp_char;
-	int i;
+	char *arg_ptr, *tmp_char, *job_list, *exclude_list;
+	char *buf, *tmp_buf;
+	int buf_size;
 	uint32_t jobid;
-	char host_string[MAXHOSTRANGELEN];
+	/* Locks: write job, read node and partition info */
+	slurmctld_lock_t job_write_lock = {
+		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 
 	arg_ptr = strstr(cmd_ptr, "ARG=");
 	if (arg_ptr == NULL) {
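For orientation between the hunks: the request format being introduced is
CMD=JOBWILLRUN ARG=<JOBID> AFTER=<JOBID>[:<JOBID>...] [EXCLUDE=<node_list>],
per the header comment above. Below is a minimal, self-contained sketch of
that format; the driver, job ids and node names are invented for
illustration and are not part of the commit.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		/* sample command in the new format (values are made up) */
		char cmd[] = "CMD=JOBWILLRUN ARG=1234 AFTER=1230:1231 "
			     "EXCLUDE=tux[0-3]";
		char *arg_ptr = strstr(cmd, "ARG=");
		char *job_list = strstr(cmd, "AFTER=");
		char *exclude_list = strstr(cmd, "EXCLUDE=");

		if ((arg_ptr == NULL) || (job_list == NULL))
			return 1;	/* ARG= and AFTER= are mandatory above */

		/* strtoul here is only for the demo; the plugin's own ARG=
		 * parsing sits in the collapsed context of this diff */
		printf("jobid=%lu\n", strtoul(arg_ptr + 4, NULL, 10));
		printf("after=%.*s\n", (int) strcspn(job_list + 6, " "),
		       job_list + 6);
		if (exclude_list)
			printf("exclude=%s\n", exclude_list + 8);
		return 0;
	}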
@@ -68,114 +81,120 @@ extern int job_will_run(char *cmd_ptr, int *err_code, char **err_msg)
 		return -1;
 	}
 
-	task_ptr = strstr(cmd_ptr, "TASKLIST=");
-	if (task_ptr) {
-		hostlist_t hl;
-		node_ptr = task_ptr + 9;
-		for (i = 0; node_ptr[i] != '\0'; i++) {
-			if (node_ptr[i] == ':')
-				node_ptr[i] = ',';
-		}
-		hl = hostlist_create(node_ptr);
-		i = hostlist_ranged_string(hl, sizeof(host_string),
-					   host_string);
-		hostlist_destroy(hl);
-		if (i < 0) {
-			*err_code = -300;
-			*err_msg = "JOBWILLRUN has invalid TASKLIST";
-			error("wiki: JOBWILLRUN has invalid TASKLIST");
-			return -1;
-		}
-	} else {
-		/* no restrictions on nodes available for use */
-		strcpy(host_string, "");
-	}
-
-	if (_will_run_test(jobid, host_string, err_code, err_msg) != 0)
-		return -1;
+	job_list = strstr(cmd_ptr, "AFTER=");
+	if (job_list) {
+		job_list += 6;
+		null_term(job_list);
+	} else {
+		*err_code = -300;
+		*err_msg = "Invalid AFTER value";
+		error("wiki: JOBWILLRUN has invalid jobid");
+		return -1;
+	}
+
+	exclude_list = strstr(cmd_ptr, "EXCLUDE=");
+	if (exclude_list) {
+		exclude_list += 8;
+		null_term(exclude_list);
+	}
+
+	lock_slurmctld(job_write_lock);
+	buf = _will_run_test(jobid, job_list, exclude_list,
+			     err_code, err_msg);
+	unlock_slurmctld(job_write_lock);
+	if (!buf) {
+		info("wiki: JOBWILLRUN failed for job %u", jobid);
+		return -1;
+	}
+
+	buf_size = strlen(buf);
+	tmp_buf = xmalloc(buf_size + 32);
+	sprintf(tmp_buf, "SC=0 ARG=%s", buf);
+	xfree(buf);
+	*err_code = 0;
+	*err_msg = tmp_buf;
 	return 0;
 }
 
-static int	_will_run_test(uint32_t jobid, char *hostlist,
-		int *err_code, char **err_msg)
+static char *	_will_run_test(uint32_t jobid, char *job_list,
+		char *exclude_list, int *err_code, char **err_msg)
 {
-	int rc = 0, i;
 	struct job_record *job_ptr;
-	/* Write lock on job info, read lock on node info */
-	slurmctld_lock_t job_write_lock = {
-		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
-	char *new_node_list, *picked_node_list = NULL;
-	bitstr_t *new_bitmap, *save_exc_bitmap, *save_req_bitmap;
-	uint32_t save_prio;
-	bitstr_t *picked_node_bitmap = NULL;
 	/* Just create a big static message buffer to avoid dealing with
 	 * xmalloc/xfree. We'll switch to compressed node naming soon
 	 * and this buffer can be set smaller then. */
 	static char reply_msg[16384];
+	bitstr_t *save_exc_bitmap = NULL, *new_bitmap = NULL;
+	uint32_t save_prio, *jobid_list = NULL;
+	struct job_record **job_ptr_list;
+	int i, job_list_size;
+	char *tmp_char;
 
-	lock_slurmctld(job_write_lock);
 	job_ptr = find_job_record(jobid);
 	if (job_ptr == NULL) {
 		*err_code = -700;
 		*err_msg = "No such job";
 		error("wiki: Failed to find job %u", jobid);
-		rc = -1;
-		unlock_slurmctld(job_write_lock);
-		return rc;
+		return NULL;
 	}
 
-	if ((job_ptr->details == NULL) || (job_ptr->job_state != JOB_PENDING)) {
+	if ((job_ptr->details == NULL) ||
+	    (job_ptr->job_state != JOB_PENDING)) {
 		*err_code = -700;
 		*err_msg = "Job not pending, can't test will_run";
 		error("wiki: Attempt to test will_run of non-pending job %u",
 		      jobid);
-		rc = -1;
-		unlock_slurmctld(job_write_lock);
-		return rc;
+		return NULL;
 	}
 
-	new_node_list = _copy_nodelist_no_dup(hostlist);
-	if (hostlist && (new_node_list == NULL)) {
-		*err_code = -700;
-		*err_msg = "Invalid TASKLIST";
-		error("wiki: Attempt to set invalid node list for job %u, %s",
-		      jobid, hostlist);
-		rc = -1;
-		unlock_slurmctld(job_write_lock);
-		return rc;
-	}
-
-	if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) {
-		*err_code = -700;
-		*err_msg = "Invalid TASKLIST";
-		error("wiki: Attempt to set invalid node list for job %u, %s",
-		      jobid, hostlist);
-		rc = -1;
-		xfree(new_node_list);
-		unlock_slurmctld(job_write_lock);
-		return rc;
-	}
+	/* parse the job list */
+	job_list_size = strlen(job_list) + 1;
+	jobid_list = xmalloc(job_list_size * sizeof(uint32_t));
+	job_ptr_list = xmalloc(job_list_size * sizeof(struct job_record *));
+	tmp_char = job_list;
+	for (i = 0; i < job_list_size; ) {
+		jobid_list[i] = strtoul(tmp_char, &tmp_char, 10);
+		if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0])) &&
+		    (tmp_char[0] != ':')) {
+			*err_code = -300;
+			*err_msg = "Invalid AFTER value";
+			error("wiki: Invalid AFTER value of %s", job_list);
+			xfree(jobid_list);
+			xfree(job_ptr_list);
+			return NULL;
+		}
+		job_ptr_list[i] = find_job_record(jobid_list[i]);
+		if (job_ptr_list[i])
+			i++;
+		else {
+			error("wiki: willrun AFTER job %u not found",
+			      jobid_list[i]);
+			jobid_list[i] = 0;
+		}
+		if (tmp_char[0] == ':')
+			tmp_char++;
+		else
+			break;
+	}
 
-	/* Put the inverse of this on the excluded node list,
-	 * Remove any required nodes, and test */
-	save_exc_bitmap = job_ptr->details->exc_node_bitmap;
-	if (hostlist[0]) {
-		/* empty hostlist, all nodes usable */
-		bit_not(new_bitmap);
-		job_ptr->details->exc_node_bitmap = new_bitmap;
-	}
-	save_req_bitmap = job_ptr->details->req_node_bitmap;
-	job_ptr->details->req_node_bitmap = bit_alloc(node_record_count);
+	if (exclude_list) {
+		if (node_name2bitmap(exclude_list, false, &new_bitmap) != 0) {
+			*err_code = -700;
+			*err_msg = "Invalid EXCLUDE value";
+			error("wiki: Attempt to set invalid exclude node "
+			      "list for job %u, %s",
+			      jobid, exclude_list);
+			return NULL;
+		}
+		save_exc_bitmap = job_ptr->details->exc_node_bitmap;
+		job_ptr->details->exc_node_bitmap = new_bitmap;
+	}
 
 	/* test when the job can execute */
 	save_prio = job_ptr->priority;
 	job_ptr->priority = 1;
 
+#if 0
+	/* execute will_run logic here */
+	/* Note that last jobid_list entry has a value of zero */
 	rc = select_nodes(job_ptr, true, &picked_node_bitmap);
 	if (picked_node_bitmap) {
 		picked_node_list = bitmap2wiki_node_name(picked_node_bitmap);
 		i = strlen(picked_node_list);
 		if ((i + 64) > sizeof(reply_msg))
 			error("wiki: will_run buffer overflow");
 	}
 
 	if (rc == SLURM_SUCCESS) {
 		*err_code = 0;
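The AFTER-list parser added above walks a colon-separated job id string
with strtoul(), rejecting anything other than ':' or whitespace as a
separator. The same technique in a self-contained form (parse_after_list
and the sample ids are invented for this sketch; the plugin itself stores
the ids in jobid_list as shown above):

	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Parse a colon-separated id list such as "1230:1231:1232".
	 * Returns the number of ids stored, or -1 on a malformed list. */
	static int parse_after_list(const char *job_list, unsigned long *ids,
				    int max_ids)
	{
		char *tmp_char = (char *) job_list;
		int i = 0;

		while (i < max_ids) {
			ids[i++] = strtoul(tmp_char, &tmp_char, 10);
			if ((tmp_char[0] != '\0') &&
			    !isspace((unsigned char) tmp_char[0]) &&
			    (tmp_char[0] != ':'))
				return -1;	/* unexpected separator */
			if (tmp_char[0] == ':')
				tmp_char++;	/* another id follows */
			else
				break;		/* end of list */
		}
		return i;
	}

	int main(void)
	{
		unsigned long ids[8];
		int i, n = parse_after_list("1230:1231:1232", ids, 8);

		for (i = 0; i < n; i++)
			printf("after job %lu\n", ids[i]);
		return 0;
	}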
@@ -186,13 +205,13 @@ static int _will_run_test(uint32_t jobid, char *hostlist,
 	} else if (rc == ESLURM_NODES_BUSY) {
 		*err_code = 1;
 		snprintf(reply_msg, sizeof(reply_msg),
-			"SC=1 Job %d runnable later TASKLIST:%s",
+			"SC=1 Job %u runnable later TASKLIST:%s",
 			jobid, picked_node_list);
 		*err_msg = reply_msg;
 	} else if (rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
 		*err_code = 1;
 		snprintf(reply_msg, sizeof(reply_msg),
-			"SC=1 Job %d not runnable with current configuration",
+			"SC=1 Job %u not runnable with current configuration",
 			jobid);
 		*err_msg = reply_msg;
 	} else {
@@ -205,37 +224,21 @@ static int _will_run_test(uint32_t jobid, char *hostlist,
 			 jobid, err_str);
 		*err_msg = reply_msg;
 	}
+#endif
 
-	/* Restore job's state, release memory */
-	xfree(picked_node_list);
-	FREE_NULL_BITMAP(picked_node_bitmap);
-	xfree(new_node_list);
-	bit_free(new_bitmap);
-	FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
-	job_ptr->details->exc_node_bitmap = save_exc_bitmap;
-	job_ptr->details->req_node_bitmap = save_req_bitmap;
+	/* Restore job's state, release allocated memory */
+	if (save_exc_bitmap)
+		job_ptr->details->exc_node_bitmap = save_exc_bitmap;
+	FREE_NULL_BITMAP(new_bitmap);
 	job_ptr->priority = save_prio;
-	unlock_slurmctld(job_write_lock);
-	return rc;
-}
-
-static char *	_copy_nodelist_no_dup(char *node_list)
-{
-	int new_size = 128;
-	char *new_str;
-	hostlist_t hl = hostlist_create(node_list);
-
-	if (hl == NULL)
-		return NULL;
-
-	hostlist_uniq(hl);
-	new_str = xmalloc(new_size);
-	while (hostlist_ranged_string(hl, new_size, new_str) == -1) {
-		new_size *= 2;
-		xrealloc(new_str, new_size);
-	}
-	hostlist_destroy(hl);
-	return new_str;
-}
+	xfree(jobid_list);
+	xfree(job_ptr_list);
+#if 1
+	*err_code = -810;
+	*err_msg = "JOBWILLRUN not yet supported";
+	return NULL;
+#endif
+}
 
 /*
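The removed _copy_nodelist_no_dup() is a compact example of the
grow-and-retry buffer pattern: hostlist_ranged_string() returns -1 while
the buffer is too small, so the buffer is doubled and the call repeated. A
self-contained sketch of the same pattern built on snprintf() (which
reports the needed length instead of returning -1); the function name and
inputs are invented, and this is not SLURM code:

	#include <stdio.h>
	#include <stdlib.h>

	/* join two strings with a comma, growing the buffer until it fits */
	static char *join_grow(const char *a, const char *b)
	{
		size_t size = 16;	/* deliberately small initial guess */
		char *buf = malloc(size), *tmp;

		if (buf == NULL)
			return NULL;
		while ((size_t) snprintf(buf, size, "%s,%s", a, b) >= size) {
			size *= 2;	/* double and retry, like the loop above */
			tmp = realloc(buf, size);
			if (tmp == NULL) {
				free(buf);
				return NULL;
			}
			buf = tmp;
		}
		return buf;
	}

	int main(void)
	{
		char *s = join_grow("tux[0-15]", "tux[32-63]");

		if (s) {
			puts(s);
			free(s);
		}
		return 0;
	}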
src/plugins/sched/wiki2/msg.c  (+2, -1)
@@ -626,7 +626,8 @@ static void _proc_msg(slurm_fd new_fd, char *msg)
 		job_release_task(cmd_ptr, &err_code, &err_msg);
 	} else if (strncmp(cmd_ptr, "JOBWILLRUN", 10) == 0) {
 		msg_type = "wiki:JOBWILLRUN";
-		job_will_run(cmd_ptr, &err_code, &err_msg);
+		if (!job_will_run(cmd_ptr, &err_code, &err_msg))
+			goto free_resp_msg;
 	} else if (strncmp(cmd_ptr, "MODIFYJOB", 9) == 0) {
 		msg_type = "wiki:MODIFYJOB";
 		job_modify_wiki(cmd_ptr, &err_code, &err_msg);
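The check added here matters because job_will_run() now returns a
heap-allocated reply on success (the header comment in job_will_run.c says
to xfree() err_msg when err_code is zero), so the dispatcher must branch to
a path that frees the buffer after responding. A freestanding sketch of
that ownership convention; handler(), proc_msg() and the reply text are
invented, and free()/strdup() stand in for SLURM's xfree()/xstrdup():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* mirrors job_will_run's contract: 0 and a heap-allocated *err_msg
	 * on success, -1 and a static error string otherwise */
	static int handler(const char *cmd, int *err_code, char **err_msg)
	{
		char *buf = strdup("SC=0 ARG=1234 TASKLIST:tux[0-3]");

		(void) cmd;
		if (buf == NULL) {
			*err_code = -810;
			*err_msg = "out of memory";	/* static, never freed */
			return -1;
		}
		*err_code = 0;
		*err_msg = buf;
		return 0;
	}

	static void proc_msg(const char *cmd_ptr)
	{
		int err_code = -300;
		char *err_msg = "unknown command";

		if (strncmp(cmd_ptr, "JOBWILLRUN", 10) == 0) {
			if (!handler(cmd_ptr, &err_code, &err_msg))
				goto free_resp_msg;
		}
		printf("reply: %d %s\n", err_code, err_msg);	/* static msg */
		return;

	free_resp_msg:
		printf("reply: %d %s\n", err_code, err_msg);
		free(err_msg);	/* success path owns the heap buffer */
	}

	int main(void)
	{
		proc_msg("JOBWILLRUN ARG=1234 AFTER=1230");
		return 0;
	}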