Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
8504a2a6
Commit
8504a2a6
authored
16 years ago
by
Moe Jette
Browse files
Options
Downloads
Patches
Plain Diff
Start of BASIL interface, needed for Cray
parent
87d16982
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/slurmctld/basil_interface.c
+259
-0
259 additions, 0 deletions
src/slurmctld/basil_interface.c
with
259 additions
and
0 deletions
src/slurmctld/basil_interface.c
0 → 100644
+
259
−
0
View file @
8504a2a6
/*****************************************************************************\
* basil_interface.c - slurmctld interface to BASIL, Cray's Batch Application
* Scheduler Interface Layer (BASIL)
*****************************************************************************
* Copyright (C) 2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* LLNL-CODE-402394.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
/* FIXME: In slurmctld/slurmctld.h, add node_ptr->basil_node_id, init to NO_VAL */
/* FIXME: In slurmctld/node_mgr.c, make _sync_bitmaps() extern */
/* FIXME: In common/node_select.c, add reservation_id to select_job */
/* FIXME: Document, ALPS must be started before SLURM */
#include
<slurm/slurm_errno.h>
#include
<string.h>
#include
"src/common/log.h"
#include
"src/common/xmalloc.h"
#include
"src/common/xstring.h"
#include
"src/slurmctld/slurmctld.h"
#define BASIL_DEBUG 1
#ifndef HAVE_BASIL
static
int
last_res_id
=
0
;
#endif
#ifdef HAVE_BASIL
/* Make sure that each SLURM node has a BASIL node ID */
static
void
_validate_basil_node_id
(
void
)
{
uint16_t
base_state
;
int
i
;
struct
node_record
*
node_ptr
=
node_record_table_ptr
;
for
(
i
=
0
;
i
<
node_record_cnt
;
i
++
,
node_ptr
++
)
if
(
node_ptr
->
basil_node_id
!=
NO_VAL
)
continue
;
base_state
=
node_ptr
->
state
&
NODE_STATE_BASE
;
if
(
base_state
==
NODE_STATE_DOWN
)
continue
;
error
(
"Node %s has no basil node_id"
,
node_ptr
->
name
);
last_node_update
=
time
(
NULL
);
set_node_down
(
node_ptr
->
name
,
"No BASIL node_id"
);
_sync_bitmaps
(
node_ptr
,
0
);
}
}
#endif
/* HAVE_BASIL */
/*
* basil_query - Query BASIL for node and reservation state.
* Execute once at slurmctld startup and periodically thereafter.
* RET 0 or error code
*/
extern
int
basil_query
(
void
)
{
int
error_code
=
SLURM_SUCCESS
;
#ifdef HAVE_BASIL
struct
config_record
*
config_ptr
;
struct
node_record
*
node_ptr
;
struct
job_record
*
job_ptr
;
ListIterator
job_iterator
;
uint16_t
base_state
;
char
*
reason
,
*
res_id
;
/* Issue the BASIL QUERY request */
if
(
request_failure
)
{
fatal
(
"basil query error: %s"
,
"TBD"
);
return
SLURM_ERROR
;
}
debug
(
"basil query initiated"
);
/* Validate configuration for each node that BASIL reports */
for
(
each_basil_node
)
{
#if BASIL_DEBUG
/* Log node state according to BASIL */
info
(
"basil query: name=%s arch=%s"
,
basil_node_name
,
basil_node_arch
,
etc
.);
#endif
/* BASIL_DEBUG */
/* NOTE: Cray should provide X-, Y- and Z-coordinates
* in the future. When that happens, we'll want to use
* those numbers to generate the hostname:
* slurm_host_name = xmalloc(sizeof(conf->node_prefix) + 4);
* sprintf(slurm_host_name: %s%d%d%d", basil_node_name, X, Y, Z);
*/
node_ptr
=
find_node_record
(
basil_node_name
);
if
(
node_ptr
==
NULL
)
{
error
(
"basil node %s not found in slurm"
,
basil_node_name
);
continue
;
}
/* Record BASIL's node_id for use in reservations */
node_ptr
->
basil_node_id
=
basil_node_id
;
/* Update slurmctld's node architecture */
if
(
node_ptr
->
arch
==
NULL
)
{
xfree
(
node_ptr
->
arch
);
node_ptr
->
arch
=
xstrdup
(
basil_node_arch
);
}
/* Update slurmctld's node state if necessary */
reason
=
NULL
;
base_state
=
node_ptr
->
state
&
NODE_STATE_BASE
;
if
(
base_state
!=
NODE_STATE_DOWN
)
{
if
(
strcmp
(
basil_state
,
"UP"
))
reason
=
"basil state not UP"
;
else
if
(
strcmp
(
basil_role
,
"BATCH"
))
reason
=
"basil role not BATCH"
;
}
/* Calculate the total count of processors and
* MB of memory on the node */
config_ptr
=
node_ptr
->
config_ptr
;
if
((
slurmctld_conf
.
fast_schedule
!=
2
)
&&
(
basil_cpus
<
config_ptr
->
cpus
))
{
error
(
"Node %s has low cpu count %d"
,
node_ptr
->
name
,
basil_cpus
);
reason
=
"Low CPUs"
;
}
node_ptr
->
cpus
=
basil_cpus
;
if
((
slurmctld_conf
.
fast_schedule
!=
2
)
&&
(
basil_memory
<
config_ptr
->
real_memory
))
{
error
(
"Node %s has low real_memory size %d"
,
node_ptr
->
name
,
basil_memory
);
reason
=
"Low RealMemory"
;
}
node_ptr
->
real_memory
=
basil_memory
;
if
(
reason
)
{
last_node_update
=
time
(
NULL
);
set_node_down
(
node_ptr
->
name
,
reason
);
_sync_bitmaps
(
node_ptr
,
0
);
}
}
_validate_basil_node_id
();
/* Validate that each BASIL reservation is still valid,
* purge vestigial reservations */
for
(
each_basil_reservation
)
{
bool
found
=
false
;
job_iterator
=
list_iterator_create
(
job_list
);
while
((
job_ptr
=
(
struct
job_record
*
)
list_next
(
job_iterator
)))
{
select_g_get_jobinfo
(
job_ptr
->
select_jobinfo
,
SELECT_DATA_BLOCK_ID
,
&
res_id
);
found
=
!
strcmp
(
res_id
,
basil_reservation_id
);
xfree
(
res_id
);
if
(
found
)
break
;
}
list_iterator_destroy
(
job_iterator
);
if
(
found
)
{
error
(
"vestigial basil reservation %s being removed"
,
basil_reservation_id
);
basil_dealloc
(
basil_reservation_id
);
}
}
#else
struct
job_record
*
job_ptr
;
ListIterator
job_iterator
;
char
*
res_id
,
*
tmp
;
job_iterator
=
list_iterator_create
(
job_list
);
while
((
job_ptr
=
(
struct
job_record
*
)
list_next
(
job_iterator
)))
{
select_g_get_jobinfo
(
job_ptr
->
select_jobinfo
,
SELECT_DATA_BLOCK_ID
,
&
res_id
);
tmp
=
strchr
(
res_id
,
'_'
);
if
(
tmp
)
{
job_res_id
=
atoi
(
tmp
+
1
);
last_res_id
=
MAX
(
last_res_id
,
job_res_id
);
}
xfree
(
res_id
);
}
list_iterator_destroy
(
job_iterator
);
debug
(
"basil_query() executed, last_res_id=%d"
,
last_res_id
);
#endif
/* HAVE_BASIL */
return
error_code
;
}
/*
* basil_reserve - create a BASIL reservation.
* IN job_ptr - pointer to job which has just been allocated resources
* RET 0 or error code
*/
extern
int
basil_reserve
(
struct
job_record
*
job_ptr
)
{
int
error_code
=
SLURM_SUCCESS
;
#ifdef HAVE_BASIL
/* Issue the BASIL RESERVE request */
if
(
request_failure
)
{
error
(
"basil reserve error: %s"
,
"TBD"
);
return
SLURM_ERROR
;
}
debug
(
"basil reservation made job_id=%u res_id=%s"
,
job_ptr
->
job_id
,
reservation_id
);
/* FIXME: add reservation_id to select_job_struct */
#else
char
*
reservation_id
;
xstrfmtcat
(
reservation_id
,
"RES_%d"
,
++
last_res_id
);
debug
(
"basil reservation made job_id=%u res_id=%s"
,
job_ptr
->
job_id
,
reservation_id
);
/* FIXME: add reservation_id to select_job_struct */
#endif
/* HAVE_BASIL */
return
error_code
;
}
/*
* basil_release - release a BASIL reservation.
* IN reservation_id - ID of reservation to release
* RET 0 or error code
*/
extern
int
basil_release
(
char
*
reservation_id
)
{
int
error_code
=
SLURM_SUCCESS
;
#ifdef HAVE_BASIL
/* Issue the BASIL RELEASE request */
if
(
request_failure
)
{
error
(
"basil release of %s error: %s"
,
reservation_id
,
"TBD"
);
return
SLURM_ERROR
;
}
debug
(
"basil release of %s complete"
,
reservation_id
);
#else
debug
(
"basil release of %s complete"
,
reservation_id
);
#endif
/* HAVE_BASIL */
return
error_code
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment