Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
b0460a3d
Commit
b0460a3d
authored
11 years ago
by
Danny Auble
Browse files
Options
Downloads
Patches
Plain Diff
NRT - Fix to supply correct error messages to poe/pmd when a launch fails.
parent
324a089e
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
NEWS
+1
-0
1 addition, 0 deletions
NEWS
src/plugins/switch/nrt/libpermapi/shr_64.c
+60
-39
60 additions, 39 deletions
src/plugins/switch/nrt/libpermapi/shr_64.c
with
61 additions
and
39 deletions
NEWS
+
1
−
0
View file @
b0460a3d
...
...
@@ -27,6 +27,7 @@ documents those changes that are of interest to users and admins.
-- Fixed minor memory leak in backfill scheduler.
-- MYSQL - Fixed memory leak when querying clusters.
-- MYSQL - Fix when updating QOS on an association.
-- NRT - Fix to supply correct error messages to poe/pmd when a launch fails.
* Changes in Slurm 2.6.6
========================
...
...
This diff is collapsed.
Click to expand it.
src/plugins/switch/nrt/libpermapi/shr_64.c
+
60
−
39
View file @
b0460a3d
...
...
@@ -89,6 +89,7 @@ static bool slurm_started = false;
static
log_options_t
log_opts
=
LOG_OPTS_STDERR_ONLY
;
static
host_usage_t
*
host_usage
=
NULL
;
static
hostlist_t
total_hl
=
NULL
;
static
int
err_msg_len
=
400
;
int
sig_array
[]
=
{
SIGINT
,
SIGQUIT
,
SIGCONT
,
SIGTERM
,
SIGHUP
,
...
...
@@ -467,8 +468,8 @@ static char *_uint16_array_to_str(int array_len, const uint16_t *array)
for
(
i
=
0
;
i
<
array_len
;
i
++
)
{
if
((
i
+
1
<
array_len
)
&&
(
array
[
i
]
==
array
[
i
+
1
]))
{
previous
++
;
continue
;
previous
++
;
continue
;
}
if
(
i
==
array_len
-
1
)
/* last time through loop */
...
...
@@ -932,7 +933,9 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
debug
(
"got pe_rm_connect called"
);
launch_common_set_stdio_fds
(
job
,
&
cio_fds
);
}
else
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_connect: unknown caller"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_connect: unknown caller"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -945,9 +948,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
if
(
!
name
)
{
if
(
hl
)
hostlist_destroy
(
hl
);
*
error_msg
=
xstrdup_printf
(
"pe_rm_connect: unknown host for ip %s"
,
connect_param
->
machine_name
[
i
]);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_connect: unknown host for ip %s"
,
connect_param
->
machine_name
[
i
]);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -964,10 +968,11 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
}
if
(
!
hl
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_connect: machine_count 0? it came in as "
"%d but we didn't get a hostlist"
,
connect_param
->
machine_count
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_connect: machine_count 0? it came in as "
"%d but we didn't get a hostlist"
,
connect_param
->
machine_count
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -993,9 +998,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
node_cnt
,
&
task_num
)
!=
SLURM_SUCCESS
)
{
xfree
(
total_node_list
);
*
error_msg
=
xstrdup_printf
(
"pe_rm_connect: problem with hack: %s"
,
slurm_strerror
(
errno
));
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_connect: problem with hack: %s"
,
slurm_strerror
(
errno
));
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1008,9 +1014,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
step_callbacks
.
step_timeout
=
_self_timeout
;
if
(
launch_g_step_launch
(
job
,
&
cio_fds
,
&
global_rc
,
&
step_callbacks
))
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_connect: problem with launch: %s"
,
slurm_strerror
(
errno
));
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_connect: problem with launch: %s"
,
slurm_strerror
(
errno
));
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1019,16 +1026,19 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
SLURM_STEP_CTX_USER_MANAGED_SOCKETS
,
&
fd_cnt
,
&
ctx_sockfds
);
if
(
ctx_sockfds
==
NULL
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_connect: Unable to get pmd IO socket array %d"
,
rc
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_connect: Unable to get pmd IO socket array %d"
,
rc
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
if
(
fd_cnt
!=
task_num
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_connect: looking for %d sockets but got back %d"
,
connect_param
->
machine_count
,
fd_cnt
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_connect: looking for %d sockets but "
"got back %d"
,
connect_param
->
machine_count
,
fd_cnt
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1169,7 +1179,9 @@ extern int pe_rm_get_event(rmhandle_t resource_mgr, job_event_t **job_event,
debug
(
"pe_rm_get_event called"
);
return
0
;
}
else
if
(
pm_type
!=
PM_POE
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_get_event: unknown caller"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_get_event: unknown caller"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1238,8 +1250,9 @@ extern int pe_rm_get_job_info(rmhandle_t resource_mgr, job_info_t **job_info,
debug
(
"pe_rm_get_job_info called"
);
return
0
;
}
else
if
(
pm_type
!=
PM_POE
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_get_job_info: unknown caller"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_get_job_info: unknown caller"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1266,8 +1279,9 @@ extern int pe_rm_get_job_info(rmhandle_t resource_mgr, job_info_t **job_info,
slurm_step_ctx_get
(
job
->
step_ctx
,
SLURM_STEP_CTX_RESP
,
&
resp
);
if
(
!
resp
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_get_job_info: no step response in step ctx"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_get_job_info: no step response in step ctx"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1653,7 +1667,7 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
xstrcat
(
opt
.
network
,
"bulk_xfer"
);
}
xfree
(
bulk_xfer
);
xfree
(
collectives
);
xfree
(
collectives
);
xfree
(
euidevice
);
xfree
(
euilib
);
xfree
(
immediate
);
...
...
@@ -1690,17 +1704,19 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
if
((
srun_debug
=
getenv
(
"SLURM_STEP_ID"
)))
step_id
=
atoi
(
srun_debug
);
if
(
job_id
==
-
1
||
step_id
==
-
1
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_init: SLURM_JOB_ID or SLURM_STEP_ID "
"not found %d.%d"
,
job_id
,
step_id
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_init: SLURM_JOB_ID or SLURM_STEP_ID "
"not found %d.%d"
,
job_id
,
step_id
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
job
=
_read_job_srun_agent
();
if
(
!
job
)
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_init: no job created"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_init: no job created"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1715,7 +1731,9 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
* PMD to fanout child processes on other nodes */
_spawn_fe_agent
();
}
else
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_init: unknown caller"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_init: unknown caller"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1749,7 +1767,7 @@ extern int pe_rm_send_event(rmhandle_t resource_mgr, job_event_t *job_event,
if
((
job_event
->
event
==
JOB_CKPT_COMPLETE
)
&&
job
)
{
struct
ckpt_end_data
*
ckpt_end_ptr
=
(
struct
ckpt_end_data
*
)
job_event
->
event_data
;
job_event
->
event_data
;
rc
=
slurm_checkpoint_complete
(
job
->
jobid
,
job
->
stepid
,
ckpt_end_ptr
->
ckpt_start_time
,
ckpt_end_ptr
->
ckpt_rc
,
...
...
@@ -1800,7 +1818,9 @@ int pe_rm_submit_job(rmhandle_t resource_mgr, job_command_t job_cmd,
if
(
slurm_cmd_fname
)
poe_cmd_fname
=
getenv
(
"MP_CMDFILE"
);
}
else
{
*
error_msg
=
xstrdup_printf
(
"pe_rm_submit_job: unknown caller"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_submit_job: unknown caller"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
@@ -1808,9 +1828,10 @@ int pe_rm_submit_job(rmhandle_t resource_mgr, job_command_t job_cmd,
debug
(
"got pe_rm_submit_job called %d"
,
job_cmd
.
job_format
);
if
(
job_cmd
.
job_format
!=
1
)
{
/* We don't handle files */
*
error_msg
=
xstrdup_printf
(
"pe_rm_submit_job: SLURM doesn't handle files "
"to submit_job"
);
*
error_msg
=
malloc
(
sizeof
(
char
)
*
err_msg_len
);
snprintf
(
*
error_msg
,
err_msg_len
,
"pe_rm_submit_job: SLURM doesn't handle files "
"to submit_job"
);
error
(
"%s"
,
*
error_msg
);
return
-
1
;
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment