Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
82f8ef55
Commit
82f8ef55
authored
22 years ago
by
Moe Jette
Browse files
Options
Downloads
Patches
Plain Diff
Return job a submit error if unable to write script and environment to
spooling file.
parent
d104ab6f
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/slurmctld/job_mgr.c
+69
-48
69 additions, 48 deletions
src/slurmctld/job_mgr.c
with
69 additions
and
48 deletions
src/slurmctld/job_mgr.c
+
69
−
48
View file @
82f8ef55
/*****************************************************************************\
/*****************************************************************************\
* job_mgr.c - manage the job information of slurm
* job_mgr.c - manage the job information of slurm
* Note: there is a global job list (job_list), job_count, time stamp
* Note: there is a global job list (job_list), job_count, time stamp
* (last_job_update), and hash table (job_hash, job_hash_over, max_hash_over)
* (last_job_update), and hash table (job_hash, job_hash_over,
* max_hash_over)
*****************************************************************************
*****************************************************************************
* Copyright (C) 2002 The Regents of the University of California.
* Copyright (C) 2002 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
...
@@ -82,10 +83,12 @@ static struct job_record *job_hash_over[MAX_JOB_COUNT];
...
@@ -82,10 +83,12 @@ static struct job_record *job_hash_over[MAX_JOB_COUNT];
static
int
max_hash_over
=
0
;
static
int
max_hash_over
=
0
;
void
add_job_hash
(
struct
job_record
*
job_ptr
);
void
add_job_hash
(
struct
job_record
*
job_ptr
);
int
copy_job_desc_to_file
(
job_desc_msg_t
*
job_desc
,
uint32_t
job_id
)
;
static
int
_copy_job_desc_to_file
(
job_desc_msg_t
*
job_desc
,
int
copy_job_desc_to_job_record
(
job_desc_msg_t
*
job_desc
,
uint32_t
job_id
)
;
struct
job_record
**
job_ptr
,
struct
part_record
*
part_ptr
,
static
int
_copy_job_desc_to_job_record
(
job_desc_msg_t
*
job_desc
,
bitstr_t
*
req_bitmap
)
;
struct
job_record
**
job_ptr
,
struct
part_record
*
part_ptr
,
bitstr_t
*
req_bitmap
)
;
void
delete_job_desc_files
(
uint32_t
job_id
);
void
delete_job_desc_files
(
uint32_t
job_id
);
void
dump_job_state
(
struct
job_record
*
dump_job_ptr
,
Buf
buffer
);
void
dump_job_state
(
struct
job_record
*
dump_job_ptr
,
Buf
buffer
);
void
dump_job_details_state
(
struct
job_details
*
detail_ptr
,
Buf
buffer
);
void
dump_job_details_state
(
struct
job_details
*
detail_ptr
,
Buf
buffer
);
...
@@ -100,8 +103,8 @@ void read_data_array_from_file ( char * file_name, char *** data, uint16_t *size
...
@@ -100,8 +103,8 @@ void read_data_array_from_file ( char * file_name, char *** data, uint16_t *size
void
signal_job_on_node
(
uint32_t
job_id
,
uint16_t
step_id
,
int
signum
,
char
*
node_name
);
void
signal_job_on_node
(
uint32_t
job_id
,
uint16_t
step_id
,
int
signum
,
char
*
node_name
);
int
top_priority
(
struct
job_record
*
job_ptr
);
int
top_priority
(
struct
job_record
*
job_ptr
);
int
validate_job_desc
(
job_desc_msg_t
*
job_desc_msg
,
int
allocate
)
;
int
validate_job_desc
(
job_desc_msg_t
*
job_desc_msg
,
int
allocate
)
;
int
write_data_to_file
(
char
*
file_name
,
char
*
data
)
;
static
int
_
write_data_to_file
(
char
*
file_name
,
char
*
data
)
;
int
write_data_array_to_file
(
char
*
file_name
,
char
**
data
,
uint16_t
size
)
;
static
int
_
write_data_array_to_file
(
char
*
file_name
,
char
**
data
,
uint16_t
size
)
;
static
inline
void
x_clear
(
void
*
arg
);
static
inline
void
x_clear
(
void
*
arg
);
/*
/*
...
@@ -1196,8 +1199,11 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
...
@@ -1196,8 +1199,11 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
return
error_code
;
return
error_code
;
}
}
/* check if select partition has sufficient resources to satisfy request */
/* check if select partition has sufficient resources to satisfy
if
(
job_desc
->
req_nodes
)
{
/* insure that selected nodes are in this partition */
* the request */
/* insure that selected nodes are in this partition */
if
(
job_desc
->
req_nodes
)
{
error_code
=
node_name2bitmap
(
job_desc
->
req_nodes
,
&
req_bitmap
);
error_code
=
node_name2bitmap
(
job_desc
->
req_nodes
,
&
req_bitmap
);
if
(
error_code
==
EINVAL
)
if
(
error_code
==
EINVAL
)
goto
cleanup
;
goto
cleanup
;
...
@@ -1209,7 +1215,7 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
...
@@ -1209,7 +1215,7 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
bit_fill_gaps
(
req_bitmap
);
bit_fill_gaps
(
req_bitmap
);
if
(
bit_super_set
(
req_bitmap
,
part_ptr
->
node_bitmap
)
!=
1
)
{
if
(
bit_super_set
(
req_bitmap
,
part_ptr
->
node_bitmap
)
!=
1
)
{
info
(
"job_create: requested nodes %s not in partition %s"
,
info
(
"job_create: requested nodes %s not in partition %s"
,
job_desc
->
req_nodes
,
part_ptr
->
name
);
job_desc
->
req_nodes
,
part_ptr
->
name
);
error_code
=
ESLURM_REQUESTED_NODES_NOT_IN_PARTITION
;
error_code
=
ESLURM_REQUESTED_NODES_NOT_IN_PARTITION
;
goto
cleanup
;
goto
cleanup
;
}
}
...
@@ -1222,12 +1228,13 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
...
@@ -1222,12 +1228,13 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
}
}
if
(
job_desc
->
num_procs
>
part_ptr
->
total_cpus
)
{
if
(
job_desc
->
num_procs
>
part_ptr
->
total_cpus
)
{
info
(
"job_create: too many cpus (%d) requested of partition %s(%d)"
,
info
(
"job_create: too many cpus (%d) requested of partition %s(%d)"
,
job_desc
->
num_procs
,
part_ptr
->
name
,
part_ptr
->
total_cpus
);
job_desc
->
num_procs
,
part_ptr
->
name
,
part_ptr
->
total_cpus
);
error_code
=
ESLURM_TOO_MANY_REQUESTED_CPUS
;
error_code
=
ESLURM_TOO_MANY_REQUESTED_CPUS
;
goto
cleanup
;
goto
cleanup
;
}
}
if
((
job_desc
->
num_nodes
>
part_ptr
->
total_nodes
)
||
if
(
(
job_desc
->
num_nodes
>
part_ptr
->
total_nodes
)
||
(
job_desc
->
num_nodes
>
part_ptr
->
max_nodes
))
{
(
job_desc
->
num_nodes
>
part_ptr
->
max_nodes
)
)
{
if
(
part_ptr
->
total_nodes
>
part_ptr
->
max_nodes
)
if
(
part_ptr
->
total_nodes
>
part_ptr
->
max_nodes
)
i
=
part_ptr
->
max_nodes
;
i
=
part_ptr
->
max_nodes
;
else
else
...
@@ -1238,25 +1245,30 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
...
@@ -1238,25 +1245,30 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
goto
cleanup
;
goto
cleanup
;
}
}
/* Perform some size checks on strings we store to prevent malicious user */
/* Perform some size checks on strings we store to prevent
/* from filling slurmctld's memory */
* malicious user filling slurmctld's memory */
if
(
job_desc
->
err
&&
(
strlen
(
job_desc
->
err
)
>
BUF_SIZE
))
{
if
(
job_desc
->
err
&&
(
strlen
(
job_desc
->
err
)
>
BUF_SIZE
))
{
info
(
"job_create: strlen(err) too big (%d)"
,
strlen
(
job_desc
->
err
));
info
(
"job_create: strlen(err) too big (%d)"
,
strlen
(
job_desc
->
err
));
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
goto
cleanup
;
goto
cleanup
;
}
}
if
(
job_desc
->
in
&&
(
strlen
(
job_desc
->
in
)
>
BUF_SIZE
))
{
if
(
job_desc
->
in
&&
(
strlen
(
job_desc
->
in
)
>
BUF_SIZE
))
{
info
(
"job_create: strlen(in) too big (%d)"
,
strlen
(
job_desc
->
in
));
info
(
"job_create: strlen(in) too big (%d)"
,
strlen
(
job_desc
->
in
));
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
goto
cleanup
;
goto
cleanup
;
}
}
if
(
job_desc
->
out
&&
(
strlen
(
job_desc
->
out
)
>
BUF_SIZE
))
{
if
(
job_desc
->
out
&&
(
strlen
(
job_desc
->
out
)
>
BUF_SIZE
))
{
info
(
"job_create: strlen(out) too big (%d)"
,
strlen
(
job_desc
->
out
));
info
(
"job_create: strlen(out) too big (%d)"
,
strlen
(
job_desc
->
out
));
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
goto
cleanup
;
goto
cleanup
;
}
}
if
(
job_desc
->
work_dir
&&
(
strlen
(
job_desc
->
work_dir
)
>
BUF_SIZE
))
{
if
(
job_desc
->
work_dir
&&
(
strlen
(
job_desc
->
work_dir
)
>
BUF_SIZE
))
{
info
(
"job_create: strlen(work_dir) too big (%d)"
,
strlen
(
job_desc
->
work_dir
));
info
(
"job_create: strlen(work_dir) too big (%d)"
,
strlen
(
job_desc
->
work_dir
));
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
error_code
=
ESLURM_PATHNAME_TOO_LONG
;
goto
cleanup
;
goto
cleanup
;
}
}
...
@@ -1266,14 +1278,18 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
...
@@ -1266,14 +1278,18 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
goto
cleanup
;
goto
cleanup
;
}
}
if
(
(
error_code
=
copy_job_desc_to_job_record
(
job_desc
,
job_rec_ptr
,
part_ptr
,
if
(
(
error_code
=
_copy_job_desc_to_job_record
(
job_desc
,
req_bitmap
)
)
)
{
job_rec_ptr
,
part_ptr
,
req_bitmap
)
)
)
{
error_code
=
ESLURM_ERROR_ON_DESC_TO_RECORD_COPY
;
error_code
=
ESLURM_ERROR_ON_DESC_TO_RECORD_COPY
;
goto
cleanup
;
goto
cleanup
;
}
}
if
(
job_desc
->
script
)
{
if
(
job_desc
->
script
)
{
if
(
(
error_code
=
copy_job_desc_to_file
(
job_desc
,
(
*
job_rec_ptr
)
->
job_id
)
)
)
{
if
(
(
error_code
=
_copy_job_desc_to_file
(
job_desc
,
(
*
job_rec_ptr
)
->
job_id
)
)
)
{
(
*
job_rec_ptr
)
->
job_state
=
JOB_FAILED
;
error_code
=
ESLURM_WRITING_TO_FILE
;
error_code
=
ESLURM_WRITING_TO_FILE
;
goto
cleanup
;
goto
cleanup
;
}
}
...
@@ -1284,23 +1300,23 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
...
@@ -1284,23 +1300,23 @@ job_create ( job_desc_msg_t *job_desc, uint32_t *new_job_id, int allocate,
if
(
part_ptr
->
shared
==
SHARED_FORCE
)
/* shared=force */
if
(
part_ptr
->
shared
==
SHARED_FORCE
)
/* shared=force */
(
*
job_rec_ptr
)
->
details
->
shared
=
1
;
(
*
job_rec_ptr
)
->
details
->
shared
=
1
;
else
if
(((
*
job_rec_ptr
)
->
details
->
shared
!=
1
)
||
else
if
(
((
*
job_rec_ptr
)
->
details
->
shared
!=
1
)
||
(
part_ptr
->
shared
==
SHARED_NO
))
/*
user or partition w
ant
no
shar
ing
*/
(
part_ptr
->
shared
==
SHARED_NO
))
/*
c
an
'
t shar
e
*/
(
*
job_rec_ptr
)
->
details
->
shared
=
0
;
(
*
job_rec_ptr
)
->
details
->
shared
=
0
;
*
new_job_id
=
(
*
job_rec_ptr
)
->
job_id
;
*
new_job_id
=
(
*
job_rec_ptr
)
->
job_id
;
return
SLURM_SUCCESS
;
return
SLURM_SUCCESS
;
cleanup:
cleanup:
if
(
req_bitmap
)
if
(
req_bitmap
)
bit_free
(
req_bitmap
);
bit_free
(
req_bitmap
);
return
error_code
;
return
error_code
;
}
}
/* copy_job_desc_to_file - copy the job script and environment from the RPC
structure
/*
_
copy_job_desc_to_file - copy the job script and environment from the RPC
* into a file */
*
structure
into a file */
int
static
int
copy_job_desc_to_file
(
job_desc_msg_t
*
job_desc
,
uint32_t
job_id
)
_
copy_job_desc_to_file
(
job_desc_msg_t
*
job_desc
,
uint32_t
job_id
)
{
{
int
error_code
=
0
;
int
error_code
=
0
;
char
*
dir_name
,
job_dir
[
20
],
*
file_name
;
char
*
dir_name
,
job_dir
[
20
],
*
file_name
;
...
@@ -1316,14 +1332,18 @@ copy_job_desc_to_file ( job_desc_msg_t * job_desc , uint32_t job_id )
...
@@ -1316,14 +1332,18 @@ copy_job_desc_to_file ( job_desc_msg_t * job_desc , uint32_t job_id )
/* Create environment file, and write data to it */
/* Create environment file, and write data to it */
file_name
=
xstrdup
(
dir_name
);
file_name
=
xstrdup
(
dir_name
);
xstrcat
(
file_name
,
"/environment"
);
xstrcat
(
file_name
,
"/environment"
);
error_code
=
write_data_array_to_file
(
file_name
,
job_desc
->
environment
,
job_desc
->
env_size
);
error_code
=
_write_data_array_to_file
(
file_name
,
job_desc
->
environment
,
job_desc
->
env_size
);
xfree
(
file_name
);
xfree
(
file_name
);
/* Create script file */
if
(
error_code
==
0
)
{
file_name
=
xstrdup
(
dir_name
);
/* Create script file */
xstrcat
(
file_name
,
"/script"
);
file_name
=
xstrdup
(
dir_name
);
error_code
=
write_data_to_file
(
file_name
,
job_desc
->
script
);
xstrcat
(
file_name
,
"/script"
);
xfree
(
file_name
);
error_code
=
_write_data_to_file
(
file_name
,
job_desc
->
script
);
xfree
(
file_name
);
}
xfree
(
dir_name
);
xfree
(
dir_name
);
return
error_code
;
return
error_code
;
...
@@ -1383,8 +1403,8 @@ rmdir2 (char * path)
...
@@ -1383,8 +1403,8 @@ rmdir2 (char * path)
}
}
/* Create file with specified name and write the supplied data array to it */
/* Create file with specified name and write the supplied data array to it */
int
static
int
write_data_array_to_file
(
char
*
file_name
,
char
**
data
,
uint16_t
size
)
_
write_data_array_to_file
(
char
*
file_name
,
char
**
data
,
uint16_t
size
)
{
{
int
fd
,
i
,
pos
,
nwrite
,
amount
;
int
fd
,
i
,
pos
,
nwrite
,
amount
;
...
@@ -1426,8 +1446,8 @@ write_data_array_to_file ( char * file_name, char ** data, uint16_t size )
...
@@ -1426,8 +1446,8 @@ write_data_array_to_file ( char * file_name, char ** data, uint16_t size )
}
}
/* Create file with specified name and write the supplied data to it */
/* Create file with specified name and write the supplied data to it */
int
static
int
write_data_to_file
(
char
*
file_name
,
char
*
data
)
_
write_data_to_file
(
char
*
file_name
,
char
*
data
)
{
{
int
fd
,
pos
,
nwrite
,
amount
;
int
fd
,
pos
,
nwrite
,
amount
;
...
@@ -1588,10 +1608,10 @@ read_data_from_file ( char * file_name, char ** data)
...
@@ -1588,10 +1608,10 @@ read_data_from_file ( char * file_name, char ** data)
return
;
return
;
}
}
/* copy_job_desc_to_job_record - copy the job descriptor from the RPC
structure
/*
_
copy_job_desc_to_job_record - copy the job descriptor from the RPC
* into the actual slurmctld job record */
*
structure
into the actual slurmctld job record */
int
static
int
copy_job_desc_to_job_record
(
job_desc_msg_t
*
job_desc
,
_
copy_job_desc_to_job_record
(
job_desc_msg_t
*
job_desc
,
struct
job_record
**
job_rec_ptr
,
struct
part_record
*
part_ptr
,
struct
job_record
**
job_rec_ptr
,
struct
part_record
*
part_ptr
,
bitstr_t
*
req_bitmap
)
bitstr_t
*
req_bitmap
)
{
{
...
@@ -1612,12 +1632,13 @@ copy_job_desc_to_job_record ( job_desc_msg_t * job_desc ,
...
@@ -1612,12 +1632,13 @@ copy_job_desc_to_job_record ( job_desc_msg_t * job_desc ,
add_job_hash
(
job_ptr
);
add_job_hash
(
job_ptr
);
if
(
job_desc
->
name
)
{
if
(
job_desc
->
name
)
{
strncpy
(
job_ptr
->
name
,
job_desc
->
name
,
sizeof
(
job_ptr
->
name
))
;
strncpy
(
job_ptr
->
name
,
job_desc
->
name
,
sizeof
(
job_ptr
->
name
))
;
}
}
job_ptr
->
user_id
=
(
uid_t
)
job_desc
->
user_id
;
job_ptr
->
user_id
=
(
uid_t
)
job_desc
->
user_id
;
job_ptr
->
job_state
=
JOB_PENDING
;
job_ptr
->
job_state
=
JOB_PENDING
;
job_ptr
->
time_limit
=
job_desc
->
time_limit
;
job_ptr
->
time_limit
=
job_desc
->
time_limit
;
if
((
job_desc
->
priority
!=
NO_VAL
)
/* also check
that
submit UID is root */
)
if
((
job_desc
->
priority
!=
NO_VAL
)
/* also check submit UID is root */
)
job_ptr
->
priority
=
job_desc
->
priority
;
job_ptr
->
priority
=
job_desc
->
priority
;
else
else
set_job_prio
(
job_ptr
);
set_job_prio
(
job_ptr
);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment