Project: tud-zih-energy / Slurm

Commit f124958e
Authored 17 years ago by Moe Jette

Modify scheduling logic to better support overlapping partitions.

Parent: 11ce6ca8
Showing 2 changed files with 55 additions and 14 deletions:

    NEWS                            +1   −0
    src/slurmctld/job_scheduler.c  +54  −14
NEWS  +1 −0

@@ -11,6 +11,7 @@ documents those changes that are of interest to users and admins.
  -- Re-write sched/backfill to utilize new will-run logic in the select
     plugins. It now supports select/cons_res and all job options (required
     nodes, excluded nodes, contiguous, etc.).
+ -- Modify scheduling logic to better support overlapping partitions.
  * Changes in SLURM 1.3.0-pre8
  =============================
src/slurmctld/job_scheduler.c  +54 −14
@@ -174,6 +174,50 @@ extern void set_job_elig_time(void)
         unlock_slurmctld(job_write_lock);
 }
 
+/* Test of part_ptr can still run jobs or if its nodes have
+ * already been reserved by higher priority jobs (those in
+ * the failed_parts array) */
+static bool _failed_partition(struct part_record *part_ptr,
+                              struct part_record **failed_parts,
+                              int failed_part_cnt)
+{
+        int i;
+
+        for (i = 0; i < failed_part_cnt; i++) {
+                if (failed_parts[i] == part_ptr)
+                        return true;
+        }
+        return false;
+}
+
+/* Add a partition to the failed_parts array, reserving its nodes
+ * from use by lower priority jobs. Also flags all partitions with
+ * nodes overlapping this partition. */
+static void _add_failed_partition(struct part_record *failed_part_ptr,
+                                  struct part_record **failed_parts,
+                                  int *failed_part_cnt)
+{
+        int count = *failed_part_cnt;
+        ListIterator part_iterator;
+        struct part_record *part_ptr;
+
+        failed_parts[count++] = failed_part_ptr;
+
+        /* We also need to add partitions that have overlapping nodes */
+        part_iterator = list_iterator_create(part_list);
+        while ((part_ptr = (struct part_record *) list_next(part_iterator))) {
+                if ((part_ptr == failed_part_ptr) ||
+                    (_failed_partition(part_ptr, failed_parts, count)) ||
+                    (!bit_overlap(part_ptr->node_bitmap,
+                                  failed_part_ptr->node_bitmap)))
+                        continue;
+                failed_parts[count++] = part_ptr;
+        }
+        list_iterator_destroy(part_iterator);
+        *failed_part_cnt = count;
+}
+
 /*
  * schedule - attempt to schedule all pending jobs
  *      pending jobs for each partition will be scheduled in priority
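The new _add_failed_partition() treats two partitions as overlapping whenever bit_overlap() returns nonzero for their node bitmaps, i.e. whenever the partitions share at least one node. As a rough, self-contained illustration of that test only (plain unsigned masks and made-up partition contents stand in for Slurm's bitstr_t node bitmaps; this is not Slurm code), the check amounts to a bitwise AND:

#include <stdbool.h>
#include <stdio.h>

/* Each bit stands for one node; bit i set => node i is in the set. */
typedef unsigned long node_mask_t;

/* Rough analogue of the bit_overlap() test used above:
 * do the two node sets share any node? */
static bool nodes_overlap(node_mask_t a, node_mask_t b)
{
        return (a & b) != 0;
}

int main(void)
{
        node_mask_t batch = 0x00FFUL;   /* hypothetical: nodes 0-7  */
        node_mask_t debug = 0x00F0UL;   /* hypothetical: nodes 4-7  */
        node_mask_t gpu   = 0xFF00UL;   /* hypothetical: nodes 8-15 */

        printf("batch/debug overlap: %d\n", nodes_overlap(batch, debug)); /* 1 */
        printf("batch/gpu   overlap: %d\n", nodes_overlap(batch, gpu));   /* 0 */
        return 0;
}

If "batch" fails to place a job, both "batch" and "debug" would be flagged for the rest of the scheduling pass, while "gpu" stays available.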
@@ -187,9 +231,9 @@ extern void set_job_elig_time(void)
 extern int schedule(void)
 {
         struct job_queue *job_queue;
-        int i, j, error_code, failed_part_cnt, job_queue_size, job_cnt = 0;
+        int i, error_code, failed_part_cnt = 0, job_queue_size, job_cnt = 0;
         struct job_record *job_ptr;
-        struct part_record **failed_parts;
+        struct part_record **failed_parts = NULL;
         /* Locks: Read config, write job, write node, read partition */
         slurmctld_lock_t job_write_lock =
             { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
@@ -227,18 +271,17 @@ extern int schedule(void)
         }
         sort_job_queue(job_queue, job_queue_size);
-        failed_part_cnt = 0;
-        failed_parts = NULL;
+        failed_parts = xmalloc(sizeof(struct part_record *) *
+                               list_count(part_list));
         for (i = 0; i < job_queue_size; i++) {
                 job_ptr = job_queue[i].job_ptr;
                 if (job_ptr->priority == 0)     /* held */
                         continue;
-                for (j = 0; j < failed_part_cnt; j++) {
-                        if (failed_parts[j] == job_ptr->part_ptr)
-                                break;
-                }
-                if (j < failed_part_cnt)
+                if (_failed_partition(job_ptr->part_ptr, failed_parts,
+                                      failed_part_cnt)) {
                         continue;
+                }
                 error_code = select_nodes(job_ptr, false, NULL);
                 if (error_code == ESLURM_NODES_BUSY) {
@@ -256,11 +299,8 @@ extern int schedule(void)
                          * group all Blue Gene job partitions of type
                          * 2x2x2 coprocessor mesh into a single SLURM
                          * partition, say "co-mesh-222") */
-                        xrealloc(failed_parts,
-                                 (failed_part_cnt + 1) *
-                                 sizeof(struct part_record *));
-                        failed_parts[failed_part_cnt++] =
-                                        job_ptr->part_ptr;
+                        _add_failed_partition(job_ptr->part_ptr, failed_parts,
+                                              &failed_part_cnt);
 #endif
                 } else if (error_code == SLURM_SUCCESS) {
                         /* job initiated */
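Taken together, the rewritten pass works like this: jobs are considered in priority order; the first time a partition's job cannot start (ESLURM_NODES_BUSY), that partition and every partition overlapping it are flagged, and later, lower-priority jobs in any flagged partition are skipped rather than allowed onto nodes reserved for the higher-priority job. A minimal, self-contained sketch of that behavior, assuming toy part/job types, a made-up node_mask field, and a try_to_start() stub in place of select_nodes() (none of these are Slurm's actual types or API):

#include <stdbool.h>
#include <stdio.h>

#define MAX_PARTS 8

/* Toy stand-ins for slurmctld's part_record and job_record. */
struct part { const char *name; unsigned node_mask; };
struct job  { int priority; struct part *part_ptr; };

/* Same idea as _failed_partition(): is this partition already flagged? */
static bool failed_partition(struct part *p, struct part **failed, int cnt)
{
        int i;
        for (i = 0; i < cnt; i++) {
                if (failed[i] == p)
                        return true;
        }
        return false;
}

/* Same idea as _add_failed_partition(): flag the partition itself,
 * then every other partition whose nodes overlap it. */
static void add_failed_partition(struct part *fp, struct part *all, int nparts,
                                 struct part **failed, int *cnt)
{
        int i;
        failed[(*cnt)++] = fp;
        for (i = 0; i < nparts; i++) {
                struct part *p = &all[i];
                if ((p == fp) || failed_partition(p, failed, *cnt) ||
                    !(p->node_mask & fp->node_mask))
                        continue;
                failed[(*cnt)++] = p;
        }
}

/* Stub in place of select_nodes(): pretend nothing can start right now. */
static bool try_to_start(struct job *j) { (void) j; return false; }

int main(void)
{
        struct part parts[] = { { "batch", 0x0F }, { "debug", 0x0C },
                                { "gpu",   0xF0 } };
        struct job queue[]  = {                 /* already sorted by priority */
                { 100, &parts[0] }, { 90, &parts[1] }, { 80, &parts[2] },
        };
        struct part *failed[MAX_PARTS];
        int failed_cnt = 0, i;

        for (i = 0; i < 3; i++) {
                struct job *job_ptr = &queue[i];
                if (job_ptr->priority == 0)     /* held */
                        continue;
                if (failed_partition(job_ptr->part_ptr, failed, failed_cnt)) {
                        printf("skip job in %s (nodes already reserved)\n",
                               job_ptr->part_ptr->name);
                        continue;
                }
                if (!try_to_start(job_ptr))     /* like ESLURM_NODES_BUSY */
                        add_failed_partition(job_ptr->part_ptr, parts, 3,
                                             failed, &failed_cnt);
        }
        return 0;
}

With these inputs the priority-90 job in "debug" is skipped because "debug" shares nodes with "batch", which already failed; the "gpu" partition is unaffected. Pre-sizing failed_parts to list_count(part_list) entries, as the xmalloc in the diff does, covers the worst case where every partition ends up flagged.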