diff --git a/src/api/allocate.c b/src/api/allocate.c
index 7bcdc28ed11c3f33b078a35813f195215dc1627b..5d7389dc5a4b36c3414dfeb7a30c6a0d1d50f84a 100644
--- a/src/api/allocate.c
+++ b/src/api/allocate.c
@@ -20,7 +20,7 @@
 
 /* slurm_allocate_resources - allocated resources for a job request */
 int
-slurm_allocate_resources (job_desc_msg_t * job_desc_msg , job_allocation_response_msg_t * job_alloc_resp_msg, int immediate )
+slurm_allocate_resources (job_desc_msg_t * job_desc_msg , resource_allocation_response_msg_t * job_alloc_resp_msg, int immediate )
 {
 	int msg_size ;
 	int rc ;
@@ -71,7 +71,7 @@ slurm_allocate_resources (job_desc_msg_t * job_desc_msg , job_allocation_respons
 	return SLURM_SUCCESS ;
 }
 
-int slurm_job_will_run (job_desc_msg_t * job_desc_msg , job_allocation_response_msg_t * job_alloc_resp_msg )
+int slurm_job_will_run (job_desc_msg_t * job_desc_msg , resource_allocation_response_msg_t * job_alloc_resp_msg )
 {
 	int msg_size ;
 	int rc ;
diff --git a/src/api/slurm.h b/src/api/slurm.h
index e3bf191bae0426b9080ced1f79bd1afcdee3ce54..592511633024c5e2507b59458e89a02c8ccc07ab 100644
--- a/src/api/slurm.h
+++ b/src/api/slurm.h
@@ -75,7 +75,7 @@ enum node_states {
  *	TotalProcs=<count>
  * NOTE: the calling function must free the allocated storage at node_list[0]
  */
-extern int slurm_allocate_resources (job_desc_msg_t * job_desc_msg , job_allocation_response_msg_t * job_alloc_resp_msg, int immediate ) ;
+extern int slurm_allocate_resources (job_desc_msg_t * job_desc_msg , resource_allocation_response_msg_t * job_alloc_resp_msg, int immediate ) ;
 
 /*
  * slurm_cancel - cancel the specified job
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 9c792e83eda5db8670590c7c672397c6a6693b69..a1932ea18b24f887af1be53d80f7f0bafc580a07 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -194,14 +194,18 @@ void slurm_free_node_table ( node_table_t * node )
 	}
 }
 
-void slurm_free_job_allocation_response_msg ( job_allocation_response_msg_t * job_alloc_resp_msg )
+void slurm_free_resource_allocation_response_msg ( resource_allocation_response_msg_t * msg )
 {
-	if ( job_alloc_resp_msg )
+	if ( msg )
 	{
-		if ( job_alloc_resp_msg -> node_list )
-			xfree ( job_alloc_resp_msg -> node_list);
-		xfree ( job_alloc_resp_msg ) ;
-	}
+		if ( msg->node_list )
+			xfree ( msg->node_list ) ;
+		if ( msg->cpus_per_node )
+			xfree ( msg->cpus_per_node ) ;
+		if ( msg->cpu_count_reps )
+			xfree ( msg->cpu_count_reps ) ;
+		xfree ( msg ) ;
+	}
 }
 
 void slurm_free_node_registration_status_msg ( slurm_node_registration_status_msg_t * msg )
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index b2d4fb09459fb3c06f2bbd44c4a1e35adacf92d6..1ab0dce145adc448b8acd81b31fc86638f0b8cac 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -167,11 +167,13 @@ typedef struct kill_tasks_msg
 } kill_tasks_msg_t ;
 
-typedef struct slurm_job_allocation_response_msg
+typedef struct resource_allocation_response_msg
 {
 	uint32_t job_id;
 	char* node_list;
-} job_allocation_response_msg_t ;
+	uint32_t* cpus_per_node;
+	uint32_t* cpu_count_reps;
+} resource_allocation_response_msg_t ;
 
 
 typedef struct job_desc_msg {	/* Job descriptor for submit, allocate, and update requests */
 	uint16_t contiguous;	/* 1 if job requires contiguous nodes, 0 otherwise,
@@ -322,6 +324,8 @@ void inline slurm_free_job_id_msg ( job_id_msg_t * msg ) ;
 void inline slurm_free_ctl_conf ( slurm_ctl_conf_info_msg_t * build_ptr ) ;
 
 void inline slurm_free_job_desc_msg ( job_desc_msg_t * msg ) ;
+void inline slurm_free_resource_allocation_response_msg( resource_allocation_response_msg_t * msg );
+
 void inline slurm_free_node_registration_status_msg ( slurm_node_registration_status_msg_t * msg ) ;
 
 void inline slurm_free_job_info ( job_info_msg_t * msg ) ;
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 39e90e399599a859eb875067f2d57517f8f06019..4554aa395d86d552a7663593bbca40e3fbf666de 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -84,7 +84,7 @@ int pack_msg ( slurm_msg_t const * msg , char ** buffer , uint32_t * buf_len )
 		case RESPONSE_RESOURCE_ALLOCATION :
 		case RESPONSE_IMMEDIATE_RESOURCE_ALLOCATION :
 		case RESPONSE_JOB_WILL_RUN :
-			pack_job_allocation_response_msg ( ( job_allocation_response_msg_t * ) msg -> data , ( void ** ) buffer , buf_len ) ;
+			pack_job_allocation_response_msg ( ( resource_allocation_response_msg_t * ) msg -> data , ( void ** ) buffer , buf_len ) ;
 			break ;
 		case REQUEST_UPDATE_NODE :
 			pack_update_node_msg ( ( update_node_msg_t * ) msg-> data , ( void ** ) buffer , buf_len ) ;
@@ -198,7 +198,7 @@ int unpack_msg ( slurm_msg_t * msg , char ** buffer , uint32_t * buf_len )
 		case RESPONSE_RESOURCE_ALLOCATION :
 		case RESPONSE_IMMEDIATE_RESOURCE_ALLOCATION :
 		case RESPONSE_JOB_WILL_RUN :
-			unpack_job_allocation_response_msg ( ( job_allocation_response_msg_t ** ) & ( msg -> data ) , ( void ** ) buffer , buf_len ) ;
+			unpack_job_allocation_response_msg ( ( resource_allocation_response_msg_t ** ) & ( msg -> data ) , ( void ** ) buffer , buf_len ) ;
 			break ;
 
 		case REQUEST_UPDATE_NODE :
@@ -289,18 +289,18 @@ int unpack_update_node_msg ( update_node_msg_t ** msg , void ** buffer , uint32_
 	return 0 ;
 }
 
-void pack_job_allocation_response_msg ( job_allocation_response_msg_t * msg, void ** buffer , uint32_t * length )
+void pack_job_allocation_response_msg ( resource_allocation_response_msg_t * msg, void ** buffer , uint32_t * length )
 {
 	pack32 ( msg -> job_id , ( void ** ) buffer , length ) ;
 	packstr ( msg -> node_list , ( void ** ) buffer , length ) ;
 }
 
-int unpack_job_allocation_response_msg ( job_allocation_response_msg_t ** msg , void ** buffer , uint32_t * length )
+int unpack_job_allocation_response_msg ( resource_allocation_response_msg_t ** msg , void ** buffer , uint32_t * length )
 {
 	uint16_t uint16_tmp;
-	job_allocation_response_msg_t * tmp_ptr ;
+	resource_allocation_response_msg_t * tmp_ptr ;
 	/* alloc memory for structure */
-	tmp_ptr = xmalloc ( sizeof ( job_allocation_response_msg_t ) ) ;
+	tmp_ptr = xmalloc ( sizeof ( resource_allocation_response_msg_t ) ) ;
 	if (tmp_ptr == NULL)
 	{
 		return ENOMEM;
diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h
index e70fc0eba1aa0b1bfc5dedd4036065c6b2d5c6e9..7332b88e8d3645755a58249ee90822e38dddf58e 100644
--- a/src/common/slurm_protocol_pack.h
+++ b/src/common/slurm_protocol_pack.h
@@ -58,8 +58,8 @@ int unpack_node_info_msg ( node_info_msg_t ** msg , void ** buf_ptr , int * buff
 int unpack_node_table_msg ( node_table_msg_t ** node , void ** buf_ptr , int * buffer_size );
 int unpack_node_table ( node_table_msg_t * node , void ** buf_ptr , int * buffer_size );
 
-void pack_job_allocation_response_msg ( job_allocation_response_msg_t * msg, void ** buffer , uint32_t * length );
-int unpack_job_allocation_response_msg ( job_allocation_response_msg_t ** msg , void ** buffer , uint32_t * length );
+void pack_job_allocation_response_msg ( resource_allocation_response_msg_t * msg, void ** buffer , uint32_t * length );
+int unpack_job_allocation_response_msg ( resource_allocation_response_msg_t ** msg , void ** buffer , uint32_t * length );
 
 void pack_update_node_msg ( update_node_msg_t * msg, void ** buffer , uint32_t * length );
 int unpack_update_node_msg ( update_node_msg_t ** msg , void ** buffer , uint32_t * length );
diff --git a/src/slurmctld/Makefile.am b/src/slurmctld/Makefile.am
index d2e60d5feadc0598433626ee9d07782907a625c4..9f9877af169baf23c684b8d16e14148ba002dbdc 100644
--- a/src/slurmctld/Makefile.am
+++ b/src/slurmctld/Makefile.am
@@ -25,8 +25,6 @@ LDADD = $(top_srcdir)/src/common/libcommon.la
 
 slurmctld_SOURCES = \
 	slurmctld.h \
-	util.c \
-	util.h \
 	parse_spec.c \
 	controller.c \
 	job_mgr.c \
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 9e3e8519d92ea1a5a05b5934408a15a9b7c262b8..3de62adfd3e1281673d3b1654c62a5160283602e 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -48,7 +48,8 @@ slurm_ctl_conf_t slurmctld_conf;
 
 int msg_from_root (void);
 void slurmctld_req ( slurm_msg_t * msg );
-void fill_build_table ( slurm_ctl_conf_t * build_ptr );
+void fill_ctld_conf ( slurm_ctl_conf_t * build_ptr );
+void parse_commandline( int argc, char* argv[], slurm_ctl_conf_t * );
 inline static void slurm_rpc_dump_build ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_dump_nodes ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_dump_partitions ( slurm_msg_t * msg ) ;
@@ -58,9 +59,11 @@ inline static void slurm_rpc_submit_batch_job ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_node_registration ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_register_node_status ( slurm_msg_t * msg ) ;
-inline static void slurm_rpc_allocate_resources_immediately ( slurm_msg_t * msg ) ;
-inline static void slurm_rpc_allocate_resources ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_job_will_run ( slurm_msg_t * msg ) ;
 
+
+inline static void slurm_rpc_allocate_resources ( slurm_msg_t * msg , uint8_t immediate ) ;
+
+
 int main (int argc, char *argv[])
 {
@@ -75,15 +78,18 @@ main (int argc, char *argv[])
 	init_time = time (NULL);
 	log_init(argv[0], opts, SYSLOG_FACILITY_DAEMON, NULL);
 
+	fill_ctld_conf ( &slurmctld_conf );
+	parse_commandline ( argc, argv, &slurmctld_conf );
+
 	if ( ( error_code = init_slurm_conf () ) )
 		fatal ("slurmctld: init_slurm_conf error %d", error_code);
-	if ( ( error_code = read_slurm_conf (SLURM_CONF) ) )
+	if ( ( error_code = read_slurm_conf ( slurmctld_conf.slurm_conf )) )
 		fatal ("slurmctld: error %d from read_slurm_conf reading %s", error_code, SLURM_CONF);
 
 	if ( ( error_code = gethostname (node_name, MAX_NAME_LEN) ) )
 		fatal ("slurmctld: errno %d from gethostname", errno);
-	if ( strcmp (node_name, control_machine) && strcmp (node_name, backup_controller) )
+	if ( strcmp (node_name, slurmctld_conf.control_machine) && strcmp (node_name, slurmctld_conf.backup_machine) )
 		fatal ("slurmctld: this machine (%s) is not the primary (%s) or backup (%s) controller",
-			node_name, control_machine, backup_controller);
+			node_name, slurmctld_conf.control_machine, slurmctld_conf.backup_machine);
 
 	if ( ( sockfd = slurm_init_msg_engine_port ( SLURM_PORT ) ) == SLURM_SOCKET_ERROR )
@@ -147,11 +153,11 @@ slurmctld_req ( slurm_msg_t * msg )
 			slurm_free_last_update_msg ( msg -> data ) ;
 			break;
 		case REQUEST_RESOURCE_ALLOCATION :
-			slurm_rpc_allocate_resources ( msg ) ;
+			slurm_rpc_allocate_resources ( msg, false ) ;
 			slurm_free_job_desc_msg ( msg -> data ) ;
 			break;
 		case REQUEST_IMMEDIATE_RESOURCE_ALLOCATION :
-			slurm_rpc_allocate_resources_immediately ( msg ) ;
+			slurm_rpc_allocate_resources ( msg, true ) ;
 			slurm_free_job_desc_msg ( msg -> data ) ;
 			break;
 		case REQUEST_JOB_WILL_RUN :
@@ -217,7 +223,7 @@ slurm_rpc_dump_build ( slurm_msg_t * msg )
 	else
 	{
 		/* success */
-		fill_build_table ( & build_tbl ) ;
+		fill_ctld_conf ( & build_tbl ) ;
 		/* init response_msg structure */
 		response_msg . address = msg -> address ;
 		response_msg . msg_type = RESPONSE_BUILD_INFO ;
@@ -477,20 +483,20 @@ slurm_rpc_submit_batch_job ( slurm_msg_t * msg )
 }
 
 /* Allocate: allocate resources for a job */
-void slurm_rpc_allocate_resources ( slurm_msg_t * msg )
+void slurm_rpc_allocate_resources ( slurm_msg_t * msg , uint8_t immediate )
 {
 	/* init */
 	int error_code;
+	slurm_msg_t response_msg ;
 	clock_t start_time;
-	uint32_t job_id ;
 	job_desc_msg_t * job_desc_msg = ( job_desc_msg_t * ) msg-> data ;
-	char * node_name_ptr = NULL;
+	resource_allocation_response_msg_t * alloc_msg = xmalloc( sizeof( resource_allocation_response_msg_t ) ) ;
 
 	start_time = clock ();
 
 	/* do RPC call */
 	error_code = job_allocate(job_desc_msg,
-		&job_id, &node_name_ptr, false , false );
+		&alloc_msg->job_id, &alloc_msg->node_list, immediate , false );
 
 	/* return result */
 	if (error_code)
@@ -502,12 +508,16 @@ void slurm_rpc_allocate_resources ( slurm_msg_t * msg )
 	else
 	{
 		info ("slurmctld_req: allocated nodes %s, JobId=%u, time=%ld",
-			node_name_ptr, job_id,
+			alloc_msg->node_list, alloc_msg->job_id,
 			(long) (clock () - start_time));
 		/* send job_ID and node_name_ptr */
+		response_msg . msg_type = ( immediate ) ? RESPONSE_IMMEDIATE_RESOURCE_ALLOCATION : RESPONSE_RESOURCE_ALLOCATION ;
+		response_msg . data = & alloc_msg ;
+		slurm_send_controller_msg ( msg->conn_fd , & response_msg ) ;
 	}
-	if (node_name_ptr)
-		xfree (node_name_ptr);
+
+	if ( alloc_msg )
+		xfree ( alloc_msg );
 }
 
 /* JobWillRun - determine if job with given configuration can be initiated now */
@@ -542,36 +552,6 @@ void slurm_rpc_job_will_run ( slurm_msg_t * msg )
 }
 
-/* slurm_rpc_allocate_resources_immediately - test if job could initiated now */
-void slurm_rpc_allocate_resources_immediately ( slurm_msg_t * msg )
-{
-	/* init */
-	int error_code;
-	clock_t start_time;
-	uint32_t job_id ;
-	job_desc_msg_t * job_desc_msg = ( job_desc_msg_t * ) msg-> data ;
-	char * node_name_ptr = NULL;
-
-	start_time = clock ();
-
-	/* do RPC call */
-	error_code = job_allocate(job_desc_msg, &job_id, &node_name_ptr, true , false );
-
-	/* return result */
-	if (error_code)
-	{
-		info ("slurmctld_req: job_will_run error %d, time=%ld",
-			error_code, (long) (clock () - start_time));
-		slurm_send_rc_msg ( msg , error_code );
-	}
-	else
-	{
-		info ("slurmctld_req: job_will_run success for , time=%ld",
-			(long) (clock () - start_time));
-		slurm_send_rc_msg ( msg , SLURM_SUCCESS );
-	}
-
-}
 
 
 /* Reconfigure - re-initialized from configuration files */
 void
@@ -672,25 +652,85 @@ slurm_rpc_register_node_status ( slurm_msg_t * msg )
 
 
 void
-fill_build_table ( slurm_ctl_conf_t * build_ptr )
+init_ctld_conf ( slurm_ctl_conf_t * conf_ptr )
+{
+	conf_ptr->last_update = init_time ;
+	conf_ptr->backup_interval = 0 ;
+	conf_ptr->backup_location = NULL ;
+	conf_ptr->backup_machine = NULL ;
+	conf_ptr->control_daemon = NULL ;
+	conf_ptr->control_machine = NULL ;
+	conf_ptr->controller_timeout = 0 ;
+	conf_ptr->epilog = NULL ;
+	conf_ptr->fast_schedule = 0 ;
+	conf_ptr->hash_base = 0 ;
+	conf_ptr->heartbeat_interval = 0;
+	conf_ptr->init_program = NULL ;
+	conf_ptr->kill_wait = 0 ;
+	conf_ptr->prioritize = NULL ;
+	conf_ptr->prolog = NULL ;
+	conf_ptr->server_daemon = NULL ;
+	conf_ptr->server_timeout = 0 ;
+	conf_ptr->slurm_conf = NULL ;
+	conf_ptr->tmp_fs = NULL ;
+}
+
+void
+fill_ctld_conf ( slurm_ctl_conf_t * conf_ptr )
 {
-	build_ptr->last_update = init_time ;
-	build_ptr->backup_interval = BACKUP_INTERVAL ;
-	build_ptr->backup_location = BACKUP_LOCATION ;
-	build_ptr->backup_machine = backup_controller ;
-	build_ptr->control_daemon = CONTROL_DAEMON ;
-	build_ptr->control_machine = control_machine ;
-	build_ptr->controller_timeout = CONTROLLER_TIMEOUT ;
-	build_ptr->epilog = EPILOG ;
-	build_ptr->fast_schedule = FAST_SCHEDULE ;
-	build_ptr->hash_base = HASH_BASE ;
-	build_ptr->heartbeat_interval = HEARTBEAT_INTERVAL;
-	build_ptr->init_program = INIT_PROGRAM ;
-	build_ptr->kill_wait = KILL_WAIT ;
-	build_ptr->prioritize = PRIORITIZE ;
-	build_ptr->prolog = PROLOG ;
-	build_ptr->server_daemon = SERVER_DAEMON ;
-	build_ptr->server_timeout = SERVER_TIMEOUT ;
-	build_ptr->slurm_conf = SLURM_CONF ;
-	build_ptr->tmp_fs = TMP_FS ;
+	conf_ptr->last_update = init_time ;
+	if ( !conf_ptr->backup_interval ) conf_ptr->backup_interval = BACKUP_INTERVAL ;
+	if ( !conf_ptr->backup_location ) conf_ptr->backup_location = BACKUP_LOCATION ;
+	if ( !conf_ptr->backup_machine ) conf_ptr->backup_machine = backup_controller ;
+	if ( !conf_ptr->control_daemon ) conf_ptr->control_daemon = CONTROL_DAEMON ;
+	if ( !conf_ptr->control_machine ) conf_ptr->control_machine = control_machine ;
+	if ( !conf_ptr->controller_timeout ) conf_ptr->controller_timeout = CONTROLLER_TIMEOUT ;
+	if ( !conf_ptr->epilog ) conf_ptr->epilog = EPILOG ;
+	if ( !conf_ptr->fast_schedule ) conf_ptr->fast_schedule = FAST_SCHEDULE ;
+	if ( !conf_ptr->hash_base ) conf_ptr->hash_base = HASH_BASE ;
+	if ( !conf_ptr->heartbeat_interval ) conf_ptr->heartbeat_interval = HEARTBEAT_INTERVAL;
+	if ( !conf_ptr->init_program ) conf_ptr->init_program = INIT_PROGRAM ;
+	if ( !conf_ptr->kill_wait ) conf_ptr->kill_wait = KILL_WAIT ;
+	if ( !conf_ptr->prioritize ) conf_ptr->prioritize = PRIORITIZE ;
+	if ( !conf_ptr->prolog ) conf_ptr->prolog = PROLOG ;
+	if ( !conf_ptr->server_daemon ) conf_ptr->server_daemon = SERVER_DAEMON ;
+	if ( !conf_ptr->server_timeout ) conf_ptr->server_timeout = SERVER_TIMEOUT ;
+	if ( !conf_ptr->slurm_conf ) conf_ptr->slurm_conf = SLURM_CONF ;
+	if ( !conf_ptr->tmp_fs ) conf_ptr->tmp_fs = TMP_FS ;
+}
+
+
+/* Variables for commandline passing using getopt */
+extern char *optarg;
+extern int optind, opterr, optopt;
+
+void
+parse_commandline( int argc, char* argv[], slurm_ctl_conf_t * conf_ptr )
+{
+	int c = 0;
+	opterr = 0;
+
+	while ((c = getopt (argc, argv, "b:c:f:s")) != -1)
+		switch (c)
+		{
+		case 'b':
+			conf_ptr->backup_machine = optarg;
+			printf("backup_machine = %s\n", conf_ptr->backup_machine );
+			break;
+		case 'c':
+			conf_ptr->control_machine = optarg;
+			printf("control_machine = %s\n", conf_ptr->control_machine );
+			break;
+		case 'f':
+			slurmctld_conf.slurm_conf = optarg;
+			printf("slurmctrld.conf = %s\n", slurmctld_conf.slurm_conf );
+			break;
+		case 's':
+			conf_ptr->fast_schedule = 1;
+			break;
+		default:
+			abort ();
+		}
+
+
 }
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index 18e6d5f94fc7aafedce232d07711539cc5b44dca..2024381c7e6a64e14f769c827ef0d885e458eef7 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -278,7 +278,7 @@ void slurm_rpc_kill_tasks ( slurm_msg_t * msg )
 void slurm_rpc_slurmd_example ( slurm_msg_t * msg )
 {
 	/* init */
-	int error_code;
+	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 
 	start_time = clock ();
diff --git a/src/slurmd/task_mgr.c b/src/slurmd/task_mgr.c
index 7b1ac5c4a3b6e456e320b5c72b80a598f98a2970..0bb6efacf92e0dca73bda402701abdb177a0c8d5 100644
--- a/src/slurmd/task_mgr.c
+++ b/src/slurmd/task_mgr.c
@@ -186,7 +186,7 @@ int append_task_to_list ( launch_tasks_msg_t * launch_msg , int pid )
 
 int kill_tasks ( kill_tasks_msg_t * kill_task_msg )
 {
-	int error_code ;
+	int error_code = SLURM_SUCCESS ;
 
 	return error_code ;
 }
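
Usage sketch (not part of the patch above): the helper name handle_alloc_response, the assumption that unpack_job_allocation_response_msg() returns 0 on success, and the reuse of the project's info() logger are illustrative only, based on the signatures introduced in this diff.

	/* assumes slurm_protocol_defs.h and slurm_protocol_pack.h are included */
	static void handle_alloc_response ( void * buffer , uint32_t buf_len )
	{
		resource_allocation_response_msg_t * resp = NULL ;

		/* unpack a RESPONSE_RESOURCE_ALLOCATION payload; 0 is assumed to mean success */
		if ( unpack_job_allocation_response_msg ( &resp , &buffer , &buf_len ) != 0 )
			return ;

		info ("JobId=%u allocated nodes %s", resp->job_id, resp->node_list);

		/* the renamed free routine releases node_list, cpus_per_node,
		 * cpu_count_reps, and then the message itself */
		slurm_free_resource_allocation_response_msg ( resp ) ;
	}

With parse_commandline() in place, slurmctld can be started as, for example, "slurmctld -c <control-machine> -b <backup-machine> -f <path to slurm.conf> -s", where -b and -c name the backup and control machines, -f overrides the compiled-in SLURM_CONF path, and -s enables fast_schedule.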