From 0e91187e9162c10c3cf2fb9bd8c7a4a6a0da5db7 Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Wed, 7 Mar 2018 17:41:48 +0100 Subject: [PATCH 1/8] White space fixup --- libjob_queue/src/local_driver.c | 54 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/libjob_queue/src/local_driver.c b/libjob_queue/src/local_driver.c index 19d2471c81..faa394825c 100644 --- a/libjob_queue/src/local_driver.c +++ b/libjob_queue/src/local_driver.c @@ -2,18 +2,18 @@ Copyright (C) 2011 Statoil ASA, Norway. The file 'local_driver.c' is part of ERT - Ensemble based Reservoir Tool. - - ERT is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ERT is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. - - See the GNU General Public License at - for more details. + + ERT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + ERT is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. + + See the GNU General Public License at + for more details. */ #include @@ -79,7 +79,7 @@ static void local_job_free(local_job_type * job) { job_status_type local_driver_get_job_status(void * __driver, void * __job) { - if (__job == NULL) + if (__job == NULL) /* The job has not been registered at all ... */ return JOB_QUEUE_NOT_ACTIVE; else { @@ -98,11 +98,11 @@ void local_driver_free_job( void * __job ) { void local_driver_kill_job( void * __driver , void * __job) { local_job_type * job = local_job_safe_cast( __job ); - + if (job->active) { - pthread_cancel( job->run_thread ); + pthread_cancel( job->run_thread ); } - + kill( job->child_process , SIGTERM ); } @@ -139,10 +139,10 @@ void * submit_job_thread__(void * __arg) { -void * local_driver_submit_job(void * __driver , - const char * submit_cmd , +void * local_driver_submit_job(void * __driver , + const char * submit_cmd , int num_cpu , /* Ignored */ - const char * run_path , + const char * run_path , const char * job_name , int argc , const char ** argv ) { @@ -155,14 +155,14 @@ void * local_driver_submit_job(void * __driver , arg_pack_append_int( arg_pack , argc ); arg_pack_append_ptr( arg_pack , util_alloc_stringlist_copy( argv , argc )); /* Due to conflict with threads and python GC we take a local copy. */ arg_pack_append_ptr( arg_pack , job ); - + pthread_mutex_lock( &driver->submit_lock ); job->active = true; job->status = JOB_QUEUE_RUNNING; - - if (pthread_create( &job->run_thread , &driver->thread_attr , submit_job_thread__ , arg_pack) != 0) + + if (pthread_create( &job->run_thread , &driver->thread_attr , submit_job_thread__ , arg_pack) != 0) util_abort("%s: failed to create run thread - aborting \n",__func__); - + pthread_mutex_unlock( &driver->submit_lock ); return job; } @@ -189,12 +189,12 @@ void * local_driver_alloc() { pthread_mutex_init( &local_driver->submit_lock , NULL ); pthread_attr_init( &local_driver->thread_attr ); pthread_attr_setdetachstate( &local_driver->thread_attr , PTHREAD_CREATE_DETACHED ); - + return local_driver; } -bool local_driver_set_option( void * __driver , const char * option_key , const void * value){ +bool local_driver_set_option( void * __driver , const char * option_key , const void * value){ return false; } @@ -202,8 +202,8 @@ void local_driver_init_option_list(stringlist_type * option_list) { //No options specific for local driver; do nothing } -#undef LOCAL_DRIVER_ID -#undef LOCAL_JOB_ID +#undef LOCAL_DRIVER_ID +#undef LOCAL_JOB_ID /*****************************************************************/ From b9c8395e5e90c8936a88d55c2761548cd5d4469a Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Thu, 8 Mar 2018 01:26:13 +0100 Subject: [PATCH 2/8] job_queue_manager will wait in stop function. --- libjob_queue/include/ert/job_queue/job_queue_manager.h | 2 +- libjob_queue/src/job_queue_manager.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/libjob_queue/include/ert/job_queue/job_queue_manager.h b/libjob_queue/include/ert/job_queue/job_queue_manager.h index 7fe82892d0..fa0026f517 100644 --- a/libjob_queue/include/ert/job_queue/job_queue_manager.h +++ b/libjob_queue/include/ert/job_queue/job_queue_manager.h @@ -31,7 +31,7 @@ typedef struct job_queue_manager_struct job_queue_manager_type; job_queue_manager_type * job_queue_manager_alloc( job_queue_type * job_queue ); void job_queue_manager_free( job_queue_manager_type * manager ); void job_queue_manager_start_queue( job_queue_manager_type * manager , int num_total_run , bool verbose); - void job_queue_manager_stop_queue(const job_queue_manager_type * manager); + void job_queue_manager_stop_queue(job_queue_manager_type * manager); bool job_queue_manager_try_wait( job_queue_manager_type * manager , int timeout_seconds); void job_queue_manager_wait( job_queue_manager_type * manager); int job_queue_manager_get_num_running( const job_queue_manager_type * manager); diff --git a/libjob_queue/src/job_queue_manager.c b/libjob_queue/src/job_queue_manager.c index 34f9d27854..58519bef16 100644 --- a/libjob_queue/src/job_queue_manager.c +++ b/libjob_queue/src/job_queue_manager.c @@ -184,9 +184,11 @@ job_status_type job_queue_manager_iget_job_status(const job_queue_manager_type * } -void job_queue_manager_stop_queue(const job_queue_manager_type * manager) { +void job_queue_manager_stop_queue(job_queue_manager_type * manager) { job_queue_start_user_exit(manager->job_queue); while(job_queue_is_running(manager->job_queue)) usleep(100000); + + job_queue_manager_wait(manager); } From 5ff887c3455572f5231f71e57308cbee27d4c29e Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Thu, 8 Mar 2018 01:27:23 +0100 Subject: [PATCH 3/8] Changed NULL comparison formatting. --- libjob_queue/src/job_node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libjob_queue/src/job_node.c b/libjob_queue/src/job_node.c index 1121b552b1..5214ebc87e 100644 --- a/libjob_queue/src/job_node.c +++ b/libjob_queue/src/job_node.c @@ -194,7 +194,7 @@ void job_queue_node_free_data(job_queue_node_type * node) { util_safe_free( node->run_cmd ); util_free_stringlist( node->argv , node->argc ); - if (node->job_data != NULL) + if (node->job_data) util_abort("%s: internal error - driver spesific job data has not been freed - will leak.\n",__func__); } @@ -593,7 +593,7 @@ bool job_queue_node_kill( job_queue_node_type * node , job_queue_status_type * s void job_queue_node_free_driver_data( job_queue_node_type * node , queue_driver_type * driver) { pthread_mutex_lock( &node->data_mutex ); { - if (node->job_data != NULL) + if (node->job_data) queue_driver_free_job( driver , node->job_data ); node->job_data = NULL; } From 5df1697d5c33e10fff6ecca1986f77bd093106ab Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Thu, 8 Mar 2018 01:31:30 +0100 Subject: [PATCH 4/8] Removed pthread_cancel() and pthread_exit() calls. --- libjob_queue/src/local_driver.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/libjob_queue/src/local_driver.c b/libjob_queue/src/local_driver.c index faa394825c..fe433570a9 100644 --- a/libjob_queue/src/local_driver.c +++ b/libjob_queue/src/local_driver.c @@ -98,11 +98,6 @@ void local_driver_free_job( void * __job ) { void local_driver_kill_job( void * __driver , void * __job) { local_job_type * job = local_job_safe_cast( __job ); - - if (job->active) { - pthread_cancel( job->run_thread ); - } - kill( job->child_process , SIGTERM ); } @@ -133,7 +128,6 @@ void * submit_job_thread__(void * __arg) { job->status = WIFEXITED(wait_status) ? JOB_QUEUE_DONE : JOB_QUEUE_IS_KILLED; } } - pthread_exit(NULL); return NULL; } From 1d19e584c8c70eb3a2382061bf899fe50ac7d8ff Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Thu, 8 Mar 2018 01:32:17 +0100 Subject: [PATCH 5/8] Changed local driver exit status to JOB_QUEUE_EXIT --- libjob_queue/src/local_driver.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libjob_queue/src/local_driver.c b/libjob_queue/src/local_driver.c index fe433570a9..f74c5495c1 100644 --- a/libjob_queue/src/local_driver.c +++ b/libjob_queue/src/local_driver.c @@ -121,12 +121,12 @@ void * submit_job_thread__(void * __arg) { arg_pack_free(arg_pack); waitpid(job->child_process, &wait_status, 0); - if(job->free_when_finished) // A request to free the job sent while its thread was still running - free(job); - else { - job->active = false; - job->status = WIFEXITED(wait_status) ? JOB_QUEUE_DONE : JOB_QUEUE_IS_KILLED; - } + job->active = false; + job->status = JOB_QUEUE_EXIT; + if (WIFEXITED(wait_status)) + if (WEXITSTATUS(wait_status) == 0) + job->status = JOB_QUEUE_DONE; + } return NULL; } From e4e3b372ab952e352469e18696ad0113c4185a4f Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Thu, 8 Mar 2018 10:07:54 +0100 Subject: [PATCH 6/8] Extracted function for calling handlers. --- libjob_queue/src/job_queue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libjob_queue/src/job_queue.c b/libjob_queue/src/job_queue.c index 5620c25362..c13ae53b61 100644 --- a/libjob_queue/src/job_queue.c +++ b/libjob_queue/src/job_queue.c @@ -851,6 +851,11 @@ static bool submit_new_jobs(job_queue_type * queue) { } } + return new_jobs; +} + + +static void run_handlers(job_queue_type * queue) { /* Checking for complete / exited / overtime jobs */ @@ -874,7 +879,7 @@ static bool submit_new_jobs(job_queue_type * queue) { break; } } - return new_jobs; + } @@ -946,6 +951,7 @@ static void job_queue_loop(job_queue_type * queue, int num_total_run, bool verbo if (!complete) { new_jobs = submit_new_jobs(queue); + run_handlers(queue); } else /* print an updated status to stdout before exiting. */ if (verbose) From d375db82d27b34cadbdc3272b8837387e3526eb1 Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Thu, 8 Mar 2018 10:10:47 +0100 Subject: [PATCH 7/8] Removed non fuctional free-when-finished code. --- libjob_queue/src/local_driver.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/libjob_queue/src/local_driver.c b/libjob_queue/src/local_driver.c index f74c5495c1..cc493585cd 100644 --- a/libjob_queue/src/local_driver.c +++ b/libjob_queue/src/local_driver.c @@ -36,7 +36,6 @@ typedef struct local_job_struct local_job_type; struct local_job_struct { UTIL_TYPE_ID_DECLARATION; bool active; - bool free_when_finished; job_status_type status; pthread_t run_thread; pid_t child_process; @@ -64,18 +63,10 @@ static local_job_type * local_job_alloc() { job = util_malloc(sizeof * job ); UTIL_TYPE_ID_INIT( job , LOCAL_JOB_TYPE_ID ); job->active = false; - job->free_when_finished = false; job->status = JOB_QUEUE_WAITING; return job; } -static void local_job_free(local_job_type * job) { - if (job->active) - job->free_when_finished = true; - else - free(job); -} - job_status_type local_driver_get_job_status(void * __driver, void * __job) { @@ -92,7 +83,8 @@ job_status_type local_driver_get_job_status(void * __driver, void * __job) { void local_driver_free_job( void * __job ) { local_job_type * job = local_job_safe_cast( __job ); - local_job_free(job); + if (!job->active) + free(job); } @@ -102,6 +94,13 @@ void local_driver_kill_job( void * __driver , void * __job) { } + +/* + This function needs to dereference the job pointer after the waitpid() call is + complete, it is therefor essential that no other threads have called free(job) + while the external process is running. +*/ + void * submit_job_thread__(void * __arg) { arg_pack_type *arg_pack = arg_pack_safe_cast(__arg); const char *executable = arg_pack_iget_const_ptr(arg_pack, 0); From d3690a86d51933e35b8584e679ecbc037e3e671a Mon Sep 17 00:00:00 2001 From: Joakim Hove Date: Thu, 8 Mar 2018 10:11:26 +0100 Subject: [PATCH 8/8] Removed hard check on node->job_data == NULL. In situations where the local driver has killed jobs this requirement can not be enforced, in that situation the local_job objects of the killed jobs will leak. --- libjob_queue/src/job_node.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/libjob_queue/src/job_node.c b/libjob_queue/src/job_node.c index 5214ebc87e..456f598bfc 100644 --- a/libjob_queue/src/job_node.c +++ b/libjob_queue/src/job_node.c @@ -193,9 +193,6 @@ void job_queue_node_free_data(job_queue_node_type * node) { util_safe_free( node->status_file ); util_safe_free( node->run_cmd ); util_free_stringlist( node->argv , node->argc ); - - if (node->job_data) - util_abort("%s: internal error - driver spesific job data has not been freed - will leak.\n",__func__); }