diff --git a/core/federated/RTI/CMakeLists.txt b/core/federated/RTI/CMakeLists.txt index 75d812ab7..73b1b0d4e 100644 --- a/core/federated/RTI/CMakeLists.txt +++ b/core/federated/RTI/CMakeLists.txt @@ -73,7 +73,6 @@ add_executable( ${CoreLib}/utils/pqueue_base.c ${CoreLib}/utils/pqueue_tag.c ${CoreLib}/utils/pqueue.c - message_record/message_record.c ) IF(CMAKE_BUILD_TYPE MATCHES DEBUG) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index fdc234ced..700304aea 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -48,6 +48,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "rti_remote.h" +#include "net_util.h" #include // To trap ctrl-c and invoke a clean stop to save the trace file, if needed. #include @@ -67,16 +68,50 @@ static rti_remote_t rti; */ const char *rti_trace_file_name = "rti.lft"; +/** Indicator that normal termination of the RTI has occurred. */ +bool normal_termination = false; + /** - * @brief A clean termination of the RTI will write the trace file, if tracing is - * enabled, before exiting. + * Send a failed signal to the specified federate. */ -void termination() { +static void send_failed_signal(federate_info_t* fed) { + size_t bytes_to_write = 1; + unsigned char buffer[bytes_to_write]; + buffer[0] = MSG_TYPE_FAILED; if (rti.base.tracing_enabled) { - stop_trace(rti.base.trace); - lf_print("RTI trace file saved."); + tracepoint_rti_to_federate(rti.base.trace, send_FAILED, fed->enclave.id, NULL); + } + int failed = write_to_socket(fed->socket, bytes_to_write, &(buffer[0])); + if (failed == 0) { + LF_PRINT_LOG("RTI has sent failed signal to federate %d due to abnormal termination.", fed->enclave.id); + } else { + lf_print_error("RTI failed to send failed signal to federate %d on socket ID %d.", fed->enclave.id, fed->socket); + } +} + +/** + * @brief Function to run upon termination. 
+ * This function will be invoked both after main() returns and when a signal + * that results in terminating the process, such as SIGINT. In the former + * case, it should do nothing. In the latter case, it will send a MSG_TYPE_FAILED + * signal to each federate and attempt to write the trace file, but without + * acquiring a mutex lock, so the resulting files may be incomplete or even + * corrupted. But this is better than just failing to write the data we have + * collected so far. + */ +void termination() { + if (!normal_termination) { + for (int i = 0; i < rti.base.number_of_scheduling_nodes; i++) { + federate_info_t *f = (federate_info_t*)rti.base.scheduling_nodes[i]; + if (!f || f->enclave.state == NOT_CONNECTED) continue; + send_failed_signal(f); + } + if (rti.base.tracing_enabled) { + stop_trace_locked(rti.base.trace); + lf_print("RTI trace file saved."); + } + lf_print("RTI is exiting abnormally."); } - lf_print("RTI is exiting."); } void usage(int argc, const char* argv[]) { @@ -86,7 +121,7 @@ void usage(int argc, const char* argv[]) { lf_print(" -n, --number_of_federates "); lf_print(" The number of federates in the federation that this RTI will control.\n"); lf_print(" -p, --port "); - lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. Default is %d.\n", UINT16_MAX, STARTING_PORT); + lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. 
Default is %d.\n", UINT16_MAX, DEFAULT_PORT); lf_print(" -c, --clock_sync [off|init|on] [period ] [exchanges-per-interval ]"); lf_print(" The status of clock synchronization for this federate."); lf_print(" - off: Clock synchronization is off."); @@ -254,6 +289,16 @@ int main(int argc, const char* argv[]) { // Catch the Ctrl-C signal, for a clean exit that does not lose the trace information signal(SIGINT, exit); +#ifdef SIGPIPE + // Ignore SIGPIPE errors, which terminate the entire application if + // socket write() fails because the reader has closed the socket. + // Instead, cause an EPIPE error to be set when write() fails. + // NOTE: The reason for a broken socket causing a SIGPIPE signal + // instead of just having write() return an error is to robustly + // handle a foo | bar pipeline where bar crashes. The default behavior + // is for foo to also exit. + signal(SIGPIPE, SIG_IGN); +#endif // SIGPIPE if (atexit(termination) != 0) { lf_print_warning("Failed to register termination function!"); } @@ -277,16 +322,28 @@ int main(int argc, const char* argv[]) { // Allocate memory for the federates rti.base.scheduling_nodes = (scheduling_node_t**)calloc(rti.base.number_of_scheduling_nodes, sizeof(scheduling_node_t*)); for (uint16_t i = 0; i < rti.base.number_of_scheduling_nodes; i++) { - federate_info_t *fed_info = (federate_info_t *) malloc(sizeof(federate_info_t)); + federate_info_t *fed_info = (federate_info_t *) calloc(1, sizeof(federate_info_t)); initialize_federate(fed_info, i); rti.base.scheduling_nodes[i] = (scheduling_node_t *) fed_info; } int socket_descriptor = start_rti_server(rti.user_specified_port); - wait_for_federates(socket_descriptor); + if (socket_descriptor >= 0) { + wait_for_federates(socket_descriptor); + normal_termination = true; + if (rti.base.tracing_enabled) { + // No need for a mutex lock because all threads have exited. 
+ stop_trace_locked(rti.base.trace); + lf_print("RTI trace file saved."); + } + } + + lf_print("RTI is exiting."); // Do this before freeing scheduling nodes. free_scheduling_nodes(rti.base.scheduling_nodes, rti.base.number_of_scheduling_nodes); - lf_print("RTI is exiting."); - return 0; + + // Even if the RTI is exiting normally, it should report an error code if one of the + // federates has reported an error. + return (int)_lf_federate_reports_error; } #endif // STANDALONE_RTI diff --git a/core/federated/RTI/message_record/message_record.c b/core/federated/RTI/message_record/message_record.c deleted file mode 100644 index bbea99b9b..000000000 --- a/core/federated/RTI/message_record/message_record.c +++ /dev/null @@ -1,176 +0,0 @@ -#if defined STANDALONE_RTI -/** - * @file message_record.c - * @author Soroush Bateni (soroush@berkeley.edu) - * @brief Record-keeping for in-transit messages. - * @version 0.1 - * @date 2022-06-02 - * - * @copyright Copyright (c) 2022, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -***************/ - -#include "message_record.h" -#include "platform.h" -#include - -/** - * @brief Initialize the in-transit message record queue. - * - * @return in_transit_message_record_q - */ -in_transit_message_record_q_t* initialize_in_transit_message_q() { - in_transit_message_record_q_t* queue = - (in_transit_message_record_q_t*)calloc( - 1, - sizeof(in_transit_message_record_q_t) - ); - queue->main_queue = pqueue_init( - 10, - in_reverse_order, - get_message_record_index, - get_message_record_position, - set_message_record_position, - tags_match, - print_message_record - ); - - queue->transfer_queue = pqueue_init( - 10, - in_reverse_order, - get_message_record_index, - get_message_record_position, - set_message_record_position, - tags_match, - print_message_record - ); - - return queue; -} - -/** - * @brief Free the memory occupied by the `queue`. - * - * @param queue The queue to free. - */ -void free_in_transit_message_q(in_transit_message_record_q_t* queue) { - pqueue_free(queue->main_queue); - pqueue_free(queue->transfer_queue); - free(queue); -} - -/** - * @brief Add a record of the in-transit message. - * - * @param queue The queue to add to. - * @param tag The tag of the in-transit message. - * @return 0 on success. 
- */ -int add_in_transit_message_record(in_transit_message_record_q_t* queue, tag_t tag) { - in_transit_message_record_t* in_transit_record = malloc(sizeof(in_transit_message_record_t)); - in_transit_record->tag = tag; - return pqueue_insert( - queue->main_queue, - (void*)in_transit_record - ); -} - -/** - * @brief Clean the record of in-transit messages up to and including `tag`. - * - * @param queue The queue to clean. - * @param tag Will clean all messages with tags <= tag. - */ -void clean_in_transit_message_record_up_to_tag(in_transit_message_record_q_t* queue, tag_t tag) { - in_transit_message_record_t* head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - while ( - head_of_in_transit_messages != NULL && // Queue is not empty - head_of_in_transit_messages->tag.time <= tag.time // The head message record has a time less than or equal to - // `tag.time`. - ) { - // Now compare the tags. The message record queue is ordered according to the `time` field, so we need to check - // all records with that `time` and find those that have smaller or equal full tags. - if (lf_tag_compare( - head_of_in_transit_messages->tag, - tag - ) <= 0 - ) { - LF_PRINT_DEBUG( - "RTI: Removed a message with tag (" PRINTF_TIME ", %u) from the list of in-transit messages.", - head_of_in_transit_messages->tag.time - lf_time_start(), - head_of_in_transit_messages->tag.microstep - ); - - free(pqueue_pop(queue->main_queue)); - } else { - // Add it to the transfer queue - pqueue_insert(queue->transfer_queue, pqueue_pop(queue->main_queue)); - } - head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - } - // Empty the transfer queue (which holds messages with equal time but larger microstep) into the main queue. - pqueue_empty_into(&queue->main_queue, &queue->transfer_queue); -} - -/** - * @brief Get the minimum tag of all currently recorded in-transit messages. 
- * - * @param queue The queue to search in (of type `in_transit_message_record_q`). - * @return tag_t The minimum tag of all currently recorded in-transit messages. Return `FOREVER_TAG` if the queue is empty. - */ -tag_t get_minimum_in_transit_message_tag(in_transit_message_record_q_t* queue) { - tag_t minimum_tag = FOREVER_TAG; - - in_transit_message_record_t* head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - while (head_of_in_transit_messages != NULL) { // Queue is not empty - // The message record queue is ordered according to the `time` field, so we need to check - // all records with the minimum `time` and find those that have the smallest tag. - if (lf_tag_compare( - head_of_in_transit_messages->tag, - minimum_tag - ) <= 0 - ) { - minimum_tag = head_of_in_transit_messages->tag; - } else if (head_of_in_transit_messages->tag.time > minimum_tag.time) { - break; - } - - // Add the head to the transfer queue. - pqueue_insert(queue->transfer_queue, pqueue_pop(queue->main_queue)); - - head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - } - // Empty the transfer queue (which holds messages with equal time but larger microstep) into the main queue. 
- pqueue_empty_into(&queue->main_queue, &queue->transfer_queue); - - if (head_of_in_transit_messages != NULL) { - LF_PRINT_DEBUG( - "RTI: Minimum tag of all in-transit messages: " PRINTF_TAG, - head_of_in_transit_messages->tag.time - lf_time_start(), - head_of_in_transit_messages->tag.microstep - ); - } - - return minimum_tag; -} - -#endif // STANDALONE_RTI diff --git a/core/federated/RTI/message_record/message_record.h b/core/federated/RTI/message_record/message_record.h deleted file mode 100644 index d57f81f64..000000000 --- a/core/federated/RTI/message_record/message_record.h +++ /dev/null @@ -1,86 +0,0 @@ -#if defined STANDALONE_RTI -/** - * @file message_record.h - * @author Soroush Bateni (soroush@berkeley.edu) - * @brief Record-keeping for in-transit messages. - * @version 0.1 - * @date 2022-06-02 - * - * @copyright Copyright (c) 2022, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -***************/ - -#ifndef RTI_MESSAGE_RECORD_H -#define RTI_MESSAGE_RECORD_H - -#include "rti_pqueue_support.h" - -/** - * @brief Queue to keep a record of in-transit messages. - * - */ -typedef struct { - pqueue_t* main_queue; // The primary queue. - pqueue_t* transfer_queue; // Queue used for housekeeping. -} in_transit_message_record_q_t; - -/** - * @brief Initialize the in-transit message record queue. - * - * @return in_transit_message_record_q - */ -in_transit_message_record_q_t* initialize_in_transit_message_q(); - -/** - * @brief Free the memory occupied by the `queue`. - * - * @param queue The queue to free. - */ -void free_in_transit_message_q(in_transit_message_record_q_t* queue); - -/** - * @brief Add a record of the in-transit message. - * - * @param queue The queue to add to (of type `in_transit_message_record_q`). - * @param tag The tag of the in-transit message. - * @return 0 on success. - */ -int add_in_transit_message_record(in_transit_message_record_q_t* queue, tag_t tag); - -/** - * @brief Clean the record of in-transit messages up to and including `tag`. - * - * @param queue The queue to clean (of type `in_transit_message_record_q`). - * @param tag Will clean all messages with tags <= tag. - */ -void clean_in_transit_message_record_up_to_tag(in_transit_message_record_q_t* queue, tag_t tag); - -/** - * @brief Get the minimum tag of all currently recorded in-transit messages. 
- * - * @param queue The queue to search in (of type `in_transit_message_record_q`). - * @return tag_t The minimum tag of all currently recorded in-transit messages. Return `FOREVER_TAG` if the queue is empty. - */ -tag_t get_minimum_in_transit_message_tag(in_transit_message_record_q_t* queue); - -#endif // RTI_MESSAGE_RECORD_H -#endif // STANDALONE_RTI diff --git a/core/federated/RTI/message_record/rti_pqueue_support.h b/core/federated/RTI/message_record/rti_pqueue_support.h deleted file mode 100644 index 09a35183a..000000000 --- a/core/federated/RTI/message_record/rti_pqueue_support.h +++ /dev/null @@ -1,101 +0,0 @@ -#if defined STANDALONE_RTI -/** - * @file rti_pqueue_support.h - * @author Soroush Bateni (soroush@berkeley.edu) - * @brief Header-only support functions for pqueue (in the RTI). - * @version 0.1 - * @date 2022-06-02 - * - * @copyright Copyright (c) 2022, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -***************/ - -#ifndef RTI_PQUEUE_SUPPORT_H -#define RTI_PQUEUE_SUPPORT_H - -#include "tag.h" -#include "utils/pqueue.h" -#include "utils/util.h" -#include "platform.h" - -// ********** Priority Queue Support Start -/** - * @brief Represent an in-transit message. - * - */ -typedef struct in_transit_message_record { - tag_t tag; // Tag of the in-transit message. - size_t pos; // Position in the priority queue. -} in_transit_message_record_t; - -/** - * Return whether or not the given `in_transit_message_record_t` types have the same tag. - */ -static int tags_match(void* next, void* curr) { - return (lf_tag_compare( - ((in_transit_message_record_t*)next)->tag, - ((in_transit_message_record_t*)curr)->tag - ) == 0); -} - -/** - * Report a priority equal to the time of the given in-transit message. - * Used for sorting pointers to in_transit_message_record_t structs. - */ -static pqueue_pri_t get_message_record_index(void *a) { - return (pqueue_pri_t)(((in_transit_message_record_t*) a)->tag.time); -} - -/** - * Return the given in_transit_message_record_t's position in the queue. - */ -static size_t get_message_record_position(void *a) { - return ((in_transit_message_record_t*) a)->pos; -} - -/** - * Set the given in_transit_message_record_t's position in the queue. 
- */ -static void set_message_record_position(void *a, size_t pos) { - ((in_transit_message_record_t*) a)->pos = pos; -} - -/** - * Print some information about the given in-transit message. - * - * DEBUG function only. - */ -static void print_message_record(void *message) { - in_transit_message_record_t *r = (in_transit_message_record_t*)message; - LF_PRINT_DEBUG( - "Tag of the in_transit_message_record_t: (" PRINTF_TIME ", %u). " - "Its position in the priority queue: %zu", - r->tag.time - lf_time_start(), - r->tag.microstep, - r->pos - ); -} - -// ********** Priority Queue Support End -#endif - -#endif // STANDALONE_RTI diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index a6554195e..33049db50 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -26,8 +26,6 @@ void initialize_rti_common(rti_common_t * _rti_common) { rti_common->num_scheduling_nodes_handling_stop = 0; } -// FIXME: For log and debug message in this file, what sould be kept: 'enclave', -// 'federate', or 'enlcave/federate'? Currently its is 'enclave/federate'. // FIXME: Should scheduling_nodes tracing use the same mechanism as federates? // It needs to account a federate having itself a number of scheduling_nodes. // Currently, all calls to tracepoint_from_federate() and @@ -66,7 +64,7 @@ void _logical_tag_complete(scheduling_node_t* enclave, tag_t completed) { enclave->completed = completed; - LF_PRINT_LOG("RTI received from federate/enclave %d the Logical Tag Complete (LTC) " PRINTF_TAG ".", + LF_PRINT_LOG("RTI received from federate/enclave %d the latest tag complete (LTC) " PRINTF_TAG ".", enclave->id, enclave->completed.time - start_time, enclave->completed.microstep); // Check downstream scheduling_nodes to see whether they should now be granted a TAG. 
@@ -101,11 +99,53 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { tag_t start_tag = {.time = start_time, .microstep = 0}; upstream->next_event = start_tag; } + // The min_delay here is a tag_t, not an interval_t because it may account for more than + // one connection. No delay at all is represented by (0,0). A delay of 0 is represented + // by (0,1). If the time part of the delay is greater than 0, then we want to ignore + // the microstep in upstream->next_event because that microstep will have been lost. + // Otherwise, we want to preserve it and add to it. This is handled by lf_tag_add(). tag_t earliest_tag_from_upstream = lf_tag_add(upstream->next_event, e->min_delays[i].min_delay); + + /* Following debug message is too verbose for normal use: LF_PRINT_DEBUG("RTI: Earliest next event upstream of fed/encl %d at fed/encl %d has tag " PRINTF_TAG ".", e->id, upstream->id, earliest_tag_from_upstream.time - start_time, earliest_tag_from_upstream.microstep); + */ + if (lf_tag_compare(earliest_tag_from_upstream, t_d) < 0) { + t_d = earliest_tag_from_upstream; + } + } + return t_d; +} + +tag_t eimt_strict(scheduling_node_t* e) { + // Find the tag of the earliest possible incoming message from immediately upstream + // enclaves or federates that are not part of a zero-delay cycle. + // This will be the smallest upstream NET plus the least delay. + // This could be NEVER_TAG if the RTI has not seen a NET from some upstream node. + tag_t t_d = FOREVER_TAG; + for (int i = 0; i < e->num_upstream; i++) { + scheduling_node_t* upstream = rti_common->scheduling_nodes[e->upstream[i]]; + // Skip this node if it is part of a zero-delay cycle. + if (is_in_zero_delay_cycle(upstream)) continue; + // If we haven't heard from the upstream node, then assume it can send an event at the start time. 
+ if (lf_tag_compare(upstream->next_event, NEVER_TAG) == 0) { + tag_t start_tag = {.time = start_time, .microstep = 0}; + upstream->next_event = start_tag; + } + // Need to consider nodes that are upstream of the upstream node because those + // nodes may send messages to the upstream node. + tag_t earliest = earliest_future_incoming_message_tag(upstream); + // If the next event of the upstream node is earlier, then use that. + if (lf_tag_compare(upstream->next_event, earliest) < 0) { + earliest = upstream->next_event; + } + tag_t earliest_tag_from_upstream = lf_delay_tag(earliest, e->upstream_delay[i]); + LF_PRINT_DEBUG("RTI: Strict EIMT of fed/encl %d at fed/encl %d has tag " PRINTF_TAG ".", + e->id, + upstream->id, + earliest_tag_from_upstream.time - start_time, earliest_tag_from_upstream.microstep); if (lf_tag_compare(earliest_tag_from_upstream, t_d) < 0) { t_d = earliest_tag_from_upstream; } @@ -152,24 +192,26 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { // Find the tag of the earliest event that may be later received from an upstream enclave // or federate (which includes any after delays on the connections). tag_t t_d = earliest_future_incoming_message_tag(e); + // Non-ZDC version of the above. This is a tag that must be strictly greater than + // that of the next granted PTAG. + tag_t t_d_strict = eimt_strict(e); LF_PRINT_LOG("RTI: Earliest next event upstream of node %d has tag " PRINTF_TAG ".", e->id, t_d.time - start_time, t_d.microstep); // Given an EIMT (earliest incoming message tag) there are these possible scenarios: // 1) The EIMT is greater than the NET we want to advance to. Grant a TAG. - // 2) The EIMT is equal to the NET and the federate is part of a zero-delay cycle (ZDC). - // 3) The EIMT is equal to the NET and the federate is not part of a ZDC. - // 4) The EIMT is less than the NET - // In (1) we can give a TAG to NET. In (2) we can give a PTAG. 
- // In (3) and (4), we wait for further updates from upstream federates. + // 2) The EIMT is equal to the NET and the strict EIMT is greater than the net + // and the federate is part of a zero-delay cycle (ZDC). Grant a PTAG. + // 3) Otherwise, grant nothing and wait for further updates. if ( // Scenario (1) above lf_tag_compare(t_d, e->next_event) > 0 // EIMT greater than NET + && lf_tag_compare(e->next_event, NEVER_TAG) > 0 // NET is not NEVER_TAG && lf_tag_compare(t_d, e->last_provisionally_granted) >= 0 // The grant is not redundant - // (equal is important to override any previous - // PTAGs). - && lf_tag_compare(t_d, e->last_granted) > 0 // The grant is not redundant. + // (equal is important to override any previous + // PTAGs). + && lf_tag_compare(t_d, e->last_granted) > 0 // The grant is not redundant. ) { // No upstream node can send events that will be received with a tag less than or equal to // e->next_event, so it is safe to send a TAG. @@ -180,9 +222,10 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { e->next_event.time - lf_time_start(), e->next_event.microstep); result.tag = e->next_event; - } else if( // Scenario (2) or (3) above + } else if( // Scenario (2) above lf_tag_compare(t_d, e->next_event) == 0 // EIMT equal to NET && is_in_zero_delay_cycle(e) // The node is part of a ZDC + && lf_tag_compare(t_d_strict, e->next_event) > 0 // The strict EIMT is greater than the NET && lf_tag_compare(t_d, e->last_provisionally_granted) > 0 // The grant is not redundant && lf_tag_compare(t_d, e->last_granted) > 0 // The grant is not redundant. ) { @@ -250,7 +293,11 @@ void notify_advance_grant_if_safe(scheduling_node_t* e) { // Local function used recursively to find minimum delays upstream. // Return in count the number of non-FOREVER_TAG entries in path_delays[]. 
-static void _update_min_delays_upstream(scheduling_node_t* end, scheduling_node_t* intermediate, tag_t path_delays[], size_t* count) { +static void _update_min_delays_upstream( + scheduling_node_t* end, + scheduling_node_t* intermediate, + tag_t path_delays[], + size_t* count) { // On first call, intermediate will be NULL, so the path delay is initialized to zero. tag_t delay_from_intermediate_so_far = ZERO_TAG; if (intermediate == NULL) { @@ -317,8 +364,8 @@ void update_min_delays_upstream(scheduling_node_t* node) { // Put the results onto the node's struct. node->num_min_delays = count; - node->min_delays = (minimum_delay_t*)malloc(count * sizeof(minimum_delay_t)); - LF_PRINT_DEBUG("++++ Node %hu(is in ZDC: %d\n", node->id, node->flags & IS_IN_ZERO_DELAY_CYCLE); + node->min_delays = (minimum_delay_t*)calloc(count, sizeof(minimum_delay_t)); + LF_PRINT_DEBUG("++++ Node %hu is in ZDC: %d", node->id, is_in_zero_delay_cycle(node)); int k = 0; for (int i = 0; i < rti_common->number_of_scheduling_nodes; i++) { if (lf_tag_compare(path_delays[i], FOREVER_TAG) < 0) { diff --git a/core/federated/RTI/rti_common.h b/core/federated/RTI/rti_common.h index d71751a98..770918d5b 100644 --- a/core/federated/RTI/rti_common.h +++ b/core/federated/RTI/rti_common.h @@ -6,9 +6,8 @@ * @author Chadlia Jerad (chadlia.jerad@ensi-uma.tn) * @copyright (c) 2020-2023, The University of California at Berkeley * License in [BSD 2-clause](https://github.com/lf-lang/reactor-c/blob/main/LICENSE.md) - * @brief Common declarations for runtime infrastructure (RTI) for scheduling enclaves and distributed Lingua Franca programs. - * This file declares RTI features that are used by scheduling enclaves as well as federated - * LF programs. + * @brief Common declarations for runtime infrastructure (RTI) for scheduling enclaves + * and distributed Lingua Franca programs. 
*/ #if defined STANDALONE_RTI || defined LF_ENCLAVES #ifndef RTI_COMMON_H @@ -112,9 +111,10 @@ typedef struct { void initialize_rti_common(rti_common_t * rti_common); /** - * An scheduling node calls this function after it completed a tag. - * The function updates the completed tag and check if the downstream scheduling nodes - * are eligible for receiving TAGs. + * @brief Update the completed tag for the specified node. + * + * This checks whether any downstream nodes become eligible to receive TAG + * or PTAG, and sends those signals if appropriate. * * The function is prepended with an underscore because a function called * `logical_tag_complete` is code-generated by the compiler. @@ -204,7 +204,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag); * If M is equal to the NET of the federate, then return PTAG(M). * * This should be called whenever an immediately upstream federate sends to - * the RTI an LTC (Logical Tag Complete), or when a transitive upstream + * the RTI an LTC (latest tag complete), or when a transitive upstream * federate sends a NET (Next Event Tag) message. * It is also called when an upstream federate resigns from the federation. * @@ -230,7 +230,7 @@ void update_scheduling_node_next_event_tag_locked(scheduling_node_t* e, tag_t ne /** * Given a node (enclave or federate), find the tag of the earliest possible incoming - * message from upstream enclaves or federates, which will be the smallest upstream NET + * message (EIMT) from upstream enclaves or federates, which will be the smallest upstream NET * plus the least delay. This could be NEVER_TAG if the RTI has not seen a NET from some * upstream node. * @param e The target node. 
@@ -238,6 +238,18 @@ void update_scheduling_node_next_event_tag_locked(scheduling_node_t* e, tag_t ne */ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e); +/** + * Given a node (enclave or federate), find the earliest incoming message tag (EIMT) from + * any immediately upstream node that is not part of a zero-delay cycle (ZDC). + * These tags are treated strictly by the RTI when deciding whether to grant a PTAG. + * Since the upstream node is not part of a ZDC, there is no need to block on the input + * from that node since we can simply wait for it to complete its tag without chance of + * introducing a deadlock. This will return FOREVER_TAG if there are no non-ZDC upstream nodes. + * @param e The target node. + * @return The earliest possible incoming message tag from a non-ZDC upstream node. + */ +tag_t eimt_strict(scheduling_node_t* e); + /** * Return true if the node is in a zero-delay cycle. * @param node The node. diff --git a/core/federated/RTI/rti_local.c b/core/federated/RTI/rti_local.c index 1f6cc0928..c75605426 100644 --- a/core/federated/RTI/rti_local.c +++ b/core/federated/RTI/rti_local.c @@ -7,17 +7,18 @@ * @copyright (c) 2020-2023, The University of California at Berkeley * License in [BSD 2-clause](https://github.com/lf-lang/reactor-c/blob/main/LICENSE.md) * - * This files implements the enclave coordination logic. + * This file implements the enclave coordination logic. * Here we are dealing with multiple mutexes. To avoid deadlocking we follow the * following rules: * 1) Mutexes are always locked in the following order: - * Enclave mutexes -> RTI mutex. + * Enclave mutexes followed by RTI mutex. * This means that we never lock an enclave mutex while holding the RTI mutex. * 2) Mutexes are always unlocked in the following order: - * RTI mutex -> Enclave mutex. - * 3) If the coordination logic might block. We unlock the enclave mutex - -*/ + * RTI mutex followed by Enclave mutex. + * 3) If the coordination logic might block.
We unlock the enclave mutex while + * blocking, using a condition variable to unblock. + * 4) When blocking on the coordination logic, never hold the RTI mutex. + */ #ifdef LF_ENCLAVES #include "rti_local.h" @@ -35,7 +36,7 @@ static rti_local_t * rti_local; lf_mutex_t rti_mutex; void initialize_local_rti(environment_t *envs, int num_envs) { - rti_local = (rti_local_t*)malloc(sizeof(rti_local_t)); + rti_local = (rti_local_t*)calloc(1, sizeof(rti_local_t)); LF_ASSERT(rti_local, "Out of memory"); initialize_rti_common(&rti_local->base); @@ -47,7 +48,7 @@ void initialize_local_rti(environment_t *envs, int num_envs) { // Allocate memory for the enclave_info objects rti_local->base.scheduling_nodes = (scheduling_node_t**)calloc(num_envs, sizeof(scheduling_node_t*)); for (int i = 0; i < num_envs; i++) { - enclave_info_t *enclave_info = (enclave_info_t *) malloc(sizeof(enclave_info_t)); + enclave_info_t *enclave_info = (enclave_info_t *) calloc(1, sizeof(enclave_info_t)); initialize_enclave_info(enclave_info, i, &envs[i]); rti_local->base.scheduling_nodes[i] = (scheduling_node_t *) enclave_info; diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 2fce8b1bf..373a109df 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -4,7 +4,7 @@ * @author Edward A. Lee * @author Soroush Bateni * @author Erling Jellum - * @author Chadlia Jerad + * @author Chadlia Jerad * @copyright (c) 2020-2023, The University of California at Berkeley * License in [BSD 2-clause](https://github.com/lf-lang/reactor-c/blob/main/LICENSE.md) * @brief Runtime infrastructure (RTI) for distributed Lingua Franca programs. 
@@ -30,7 +30,6 @@ #include "net_util.h" #include - // Global variables defined in tag.c: extern instant_t start_time; @@ -39,25 +38,45 @@ extern instant_t start_time; */ static rti_remote_t *rti_remote; +bool _lf_federate_reports_error = false; + // A convenient macro for getting the `federate_info_t *` at index `_idx` -// and casting it. -#define GET_FED_INFO(_idx) (federate_info_t *) rti_remote->base.scheduling_nodes[_idx] +// and casting it. +#define GET_FED_INFO(_idx) (federate_info_t *)rti_remote->base.scheduling_nodes[_idx] lf_mutex_t rti_mutex; lf_cond_t received_start_times; lf_cond_t sent_start_time; -extern int lf_critical_section_enter(environment_t* env) { +extern int lf_critical_section_enter(environment_t *env) { return lf_mutex_lock(&rti_mutex); } -extern int lf_critical_section_exit(environment_t* env) { +extern int lf_critical_section_exit(environment_t *env) { return lf_mutex_unlock(&rti_mutex); } -int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_type) { +/** + * Create a server and enable listening for socket connections. + * If the specified port is non-zero, it will attempt to acquire that port. + * If it fails, it will repeatedly attempt up to PORT_BIND_RETRY_LIMIT times with + * a delay of PORT_BIND_RETRY_INTERVAL in between. If the specified port is + * zero, then it will attempt to acquire DEFAULT_PORT first. If this fails, then it + * will repeatedly attempt up to PORT_BIND_RETRY_LIMIT times, incrementing the port + * number between attempts, with no delay between attempts. Once it has incremented + * the port number MAX_NUM_PORT_ADDRESSES times, it will cycle around and begin again + * with DEFAULT_PORT. + * + * @param port The port number to use or 0 to start trying at DEFAULT_PORT. + * @param socket_type The type of the socket for the server (TCP or UDP). + * @return The socket descriptor on which to accept connections.
+ */ +static int create_rti_server(uint16_t port, socket_type_t socket_type) { // Timeout time for the communications of the server - struct timeval timeout_time = {.tv_sec = TCP_TIMEOUT_TIME / BILLION, .tv_usec = (TCP_TIMEOUT_TIME % BILLION) / 1000}; + struct timeval timeout_time = { + .tv_sec = TCP_TIMEOUT_TIME / BILLION, + .tv_usec = (TCP_TIMEOUT_TIME % BILLION) / 1000 + }; // Create an IPv4 socket for TCP (not UDP) communication over IP (0). int socket_descriptor = -1; if (socket_type == TCP) { @@ -65,29 +84,47 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty } else if (socket_type == UDP) { socket_descriptor = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); // Set the appropriate timeout time - timeout_time = (struct timeval){.tv_sec = UDP_TIMEOUT_TIME / BILLION, .tv_usec = (UDP_TIMEOUT_TIME % BILLION) / 1000}; + timeout_time = (struct timeval){ + .tv_sec = UDP_TIMEOUT_TIME / BILLION, + .tv_usec = (UDP_TIMEOUT_TIME % BILLION) / 1000 + }; } if (socket_descriptor < 0) { - lf_print_error_and_exit("Failed to create RTI socket."); + lf_print_error_system_failure("Failed to create RTI socket."); } // Set the option for this socket to reuse the same address int true_variable = 1; // setsockopt() requires a reference to the value assigned to an option - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEADDR, &true_variable, sizeof(int32_t)) < 0) { + if (setsockopt( + socket_descriptor, + SOL_SOCKET, + SO_REUSEADDR, + &true_variable, + sizeof(int32_t)) < 0) { lf_print_error("RTI failed to set SO_REUSEADDR option on the socket: %s.", strerror(errno)); } // Set the timeout on the socket so that read and write operations don't block for too long - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_RCVTIMEO, (const char*)&timeout_time, sizeof(timeout_time)) < 0) { + if (setsockopt( + socket_descriptor, + SOL_SOCKET, + SO_RCVTIMEO, + (const char *)&timeout_time, + sizeof(timeout_time)) < 0) { lf_print_error("RTI failed to set SO_RCVTIMEO 
option on the socket: %s.", strerror(errno)); } - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_SNDTIMEO, (const char*)&timeout_time, sizeof(timeout_time)) < 0) { + if (setsockopt( + socket_descriptor, + SOL_SOCKET, + SO_SNDTIMEO, + (const char *)&timeout_time, + sizeof(timeout_time)) < 0) { lf_print_error("RTI failed to set SO_SNDTIMEO option on the socket: %s.", strerror(errno)); } /* * The following used to permit reuse of a port that an RTI has previously - * used that has not been released. We no longer do this, but instead - * increment the port number until an available port is found. + * used that has not been released. We no longer do this, and instead retry + * some number of times after waiting. // SO_REUSEPORT (since Linux 3.9) // Permits multiple AF_INET or AF_INET6 sockets to be bound to an @@ -99,11 +136,6 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty // UDP sockets. int reuse = 1; - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEADDR, - (const char*)&reuse, sizeof(reuse)) < 0) { - perror("setsockopt(SO_REUSEADDR) failed"); - } - #ifdef SO_REUSEPORT if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEPORT, (const char*)&reuse, sizeof(reuse)) < 0) { @@ -115,42 +147,46 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty // Server file descriptor. struct sockaddr_in server_fd; // Zero out the server address structure. - bzero((char *) &server_fd, sizeof(server_fd)); + bzero((char *)&server_fd, sizeof(server_fd)); - server_fd.sin_family = AF_INET; // IPv4 - server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. + uint16_t specified_port = port; + if (specified_port == 0) port = DEFAULT_PORT; + + server_fd.sin_family = AF_INET; // IPv4 + server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. // Convert the port number from host byte order to network byte order. 
server_fd.sin_port = htons(port); int result = bind( socket_descriptor, - (struct sockaddr *) &server_fd, + (struct sockaddr *)&server_fd, sizeof(server_fd)); - // If the binding fails with this port and no particular port was specified - // in the LF program, then try the next few ports in sequence. - while (result != 0 - && specified_port == 0 - && port >= STARTING_PORT - && port <= STARTING_PORT + PORT_RANGE_LIMIT) { - lf_print("RTI failed to get port %d. Trying %d.", port, port + 1); - port++; - server_fd.sin_port = htons(port); - result = bind( - socket_descriptor, - (struct sockaddr *) &server_fd, - sizeof(server_fd)); - } - if (result != 0) { + // Try repeatedly to bind to a port. If no specific port is specified, then + // increment the port number each time. + + int count = 1; + while (result != 0 && count++ < PORT_BIND_RETRY_LIMIT) { if (specified_port == 0) { - lf_print_error_and_exit("Failed to bind the RTI socket. Cannot find a usable port. " - "Consider increasing PORT_RANGE_LIMIT in net_common.h."); + lf_print_warning("RTI failed to get port %d.", port); + port++; + if (port >= DEFAULT_PORT + MAX_NUM_PORT_ADDRESSES) port = DEFAULT_PORT; + lf_print_warning("RTI will try again with port %d.", port); + server_fd.sin_port = htons(port); + // Do not sleep. } else { - lf_print_error_and_exit("Failed to bind the RTI socket. Specified port is not available. " - "Consider leaving the port unspecified"); + lf_print("RTI failed to get port %d. Will try again.", port); + lf_sleep(PORT_BIND_RETRY_INTERVAL); } + result = bind( + socket_descriptor, + (struct sockaddr *)&server_fd, + sizeof(server_fd)); + } + if (result != 0) { + lf_print_error_and_exit("Failed to bind the RTI socket. Port %d is not available. 
", port); } - char* type = "TCP"; + char *type = "TCP"; if (socket_type == UDP) { type = "UDP"; } @@ -170,11 +206,10 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty return socket_descriptor; } -void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { +void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 - || lf_tag_compare(tag, e->last_provisionally_granted) < 0 - ) { + || lf_tag_compare(tag, e->last_provisionally_granted) < 0) { return; } // Need to make sure that the destination federate's thread has already @@ -195,25 +230,20 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // This function is called in notify_advance_grant_if_safe(), which is a long // function. During this call, the socket might close, causing the following write_to_socket // to fail. Consider a failure here a soft failure and update the federate's status. - ssize_t bytes_written = write_to_socket(((federate_info_t*)e)->socket, message_length, buffer); - if (bytes_written < (ssize_t)message_length) { + if (write_to_socket(((federate_info_t *)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - if (bytes_written < 0) { - e->state = NOT_CONNECTED; - // FIXME: We need better error handling, but don't stop other execution here. 
- } + e->state = NOT_CONNECTED; } else { e->last_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", - e->id, tag.time - start_time, tag.microstep); + e->id, tag.time - start_time, tag.microstep); } } -void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { +void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 - || lf_tag_compare(tag, e->last_provisionally_granted) <= 0 - ) { + || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { return; } // Need to make sure that the destination federate's thread has already @@ -228,20 +258,15 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { encode_int64(tag.time, &(buffer[1])); encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - if (rti_remote->base.tracing_enabled){ + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_PTAG, e->id, &tag); } // This function is called in notify_advance_grant_if_safe(), which is a long // function. During this call, the socket might close, causing the following write_to_socket // to fail. Consider a failure here a soft failure and update the federate's status. - ssize_t bytes_written = write_to_socket(((federate_info_t*)e)->socket, message_length, buffer); - - if (bytes_written < (ssize_t)message_length) { + if (write_to_socket(((federate_info_t *)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - if (bytes_written < 0) { - e->state = NOT_CONNECTED; - // FIXME: We need better error handling, but don't stop other execution here. 
- } + e->state = NOT_CONNECTED; } else { e->last_provisionally_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", @@ -251,22 +276,26 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // a later or equal PTAG or TAG sent previously and if their transitive // NET is greater than or equal to the tag. // This is needed to stimulate absent messages from upstream and break deadlocks. - // NOTE: This could later be replaced with a TNET mechanism once - // we have an available encoding of causality interfaces. - // That might be more efficient. + // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` + // and `test/C/src/federated/FeedbackDelay4.lf`. + // Note that this is transitive. // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. // It's only needed for federates, which is why this is implemented here. for (int j = 0; j < e->num_upstream; j++) { - scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; + scheduling_node_t *upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; // Ignore this federate if it has resigned. - if (upstream->state == NOT_CONNECTED) continue; + if (upstream->state == NOT_CONNECTED) + continue; tag_t earliest = earliest_future_incoming_message_tag(upstream); + tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. // If these tags are equal, then a TAG or PTAG should have already been granted, // in which case, another will not be sent. But it may not have been already granted. 
- if (lf_tag_compare(earliest, tag) >= 0) { + if (lf_tag_compare(earliest, tag) > 0) { + notify_tag_advance_grant(upstream, tag); + } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { notify_provisional_tag_advance_grant(upstream, tag); } } @@ -274,24 +303,21 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { } void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { - federate_info_t* fed = GET_FED_INFO(federate_id); - tag_t min_in_transit_tag = get_minimum_in_transit_message_tag(fed->in_transit_message_tags); - if (lf_tag_compare( - min_in_transit_tag, - next_event_tag - ) < 0 - ) { + federate_info_t *fed = GET_FED_INFO(federate_id); + tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); + if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { next_event_tag = min_in_transit_tag; } update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); } -void handle_port_absent_message(federate_info_t* sending_federate, unsigned char* buffer) { +void handle_port_absent_message(federate_info_t *sending_federate, unsigned char *buffer) { size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); - read_from_socket_errexit(sending_federate->socket, message_size, &(buffer[1]), - " RTI failed to read port absent message from federate %u.", - sending_federate->enclave.id); + read_from_socket_fail_on_error( + &sending_federate->socket, message_size, &(buffer[1]), NULL, + " RTI failed to read port absent message from federate %u.", + sending_federate->enclave.id); uint16_t reactor_port_id = extract_uint16(&(buffer[1])); uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); @@ -304,34 +330,33 @@ void handle_port_absent_message(federate_info_t* sending_federate, unsigned char // Need to acquire the mutex lock to ensure that the thread handling // messages coming from the socket 
connected to the destination does not // issue a TAG before this message has been forwarded. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); // If the destination federate is no longer connected, issue a warning // and return. - federate_info_t* fed = GET_FED_INFO(federate_id); + federate_info_t *fed = GET_FED_INFO(federate_id); if (fed->enclave.state == NOT_CONNECTED) { - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", - federate_id); + federate_id); LF_PRINT_LOG("Fed status: next_event (" PRINTF_TIME ", %d), " - "completed (" PRINTF_TIME ", %d), " - "last_granted (" PRINTF_TIME ", %d), " - "last_provisionally_granted (" PRINTF_TIME ", %d).", - fed->enclave.next_event.time - start_time, - fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, - fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, - fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep - ); + "completed (" PRINTF_TIME ", %d), " + "last_granted (" PRINTF_TIME ", %d), " + "last_provisionally_granted (" PRINTF_TIME ", %d).", + fed->enclave.next_event.time - start_time, + fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, + fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, + fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); return; } LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", - reactor_port_id, - federate_id); + reactor_port_id, + federate_id); // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. 
@@ -340,21 +365,24 @@ void handle_port_absent_message(federate_info_t* sending_federate, unsigned char lf_cond_wait(&sent_start_time); } - // Forward the message. - int destination_socket = fed->socket; if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_PORT_ABS, federate_id, &tag); } - write_to_socket_errexit(destination_socket, message_size + 1, buffer, - "RTI failed to forward message to federate %d.", federate_id); - lf_mutex_unlock(&rti_mutex); + // Forward the message. + write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); + + LF_MUTEX_UNLOCK(rti_mutex); } -void handle_timed_message(federate_info_t* sending_federate, unsigned char* buffer) { - size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + sizeof(int64_t) + sizeof(uint32_t); +void handle_timed_message(federate_info_t *sending_federate, unsigned char *buffer) { + size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + + sizeof(int64_t) + sizeof(uint32_t); // Read the header, minus the first byte which has already been read. - read_from_socket_errexit(sending_federate->socket, header_size - 1, &(buffer[1]), "RTI failed to read the timed message header from remote federate."); + read_from_socket_fail_on_error( + &sending_federate->socket, header_size - 1, &(buffer[1]), NULL, + "RTI failed to read the timed message header from remote federate."); // Extract the header information. of the sender uint16_t reactor_port_id; uint16_t federate_id; @@ -368,8 +396,8 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff if (FED_COM_BUFFER_SIZE < header_size + 1) { lf_print_error_and_exit("Buffer size (%d) is not large enough to " - "read the header plus one byte.", - FED_COM_BUFFER_SIZE); + "read the header plus one byte.", + FED_COM_BUFFER_SIZE); } // Cut up the payload in chunks. 
@@ -377,13 +405,13 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff bytes_to_read = FED_COM_BUFFER_SIZE - header_size; } - LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " - PRINTF_TAG ". Forwarding.", - sending_federate->enclave.id, federate_id, reactor_port_id, - intended_tag.time - lf_time_start(), intended_tag.microstep); + LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG ". Forwarding.", + sending_federate->enclave.id, federate_id, reactor_port_id, + intended_tag.time - lf_time_start(), intended_tag.microstep); - read_from_socket_errexit(sending_federate->socket, bytes_to_read, &(buffer[header_size]), - "RTI failed to read timed message from federate %d.", federate_id); + read_from_socket_fail_on_error( + &sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, + "RTI failed to read timed message from federate %d.", federate_id); size_t bytes_read = bytes_to_read + header_size; // Following only works for string messages. // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); @@ -395,68 +423,35 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff // Need to acquire the mutex lock to ensure that the thread handling // messages coming from the socket connected to the destination does not // issue a TAG before this message has been forwarded. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); // If the destination federate is no longer connected, issue a warning // and return. federate_info_t *fed = GET_FED_INFO(federate_id); if (fed->enclave.state == NOT_CONNECTED) { - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", - federate_id); + federate_id); LF_PRINT_LOG("Fed status: next_event (" PRINTF_TIME ", %d), " - "completed (" PRINTF_TIME ", %d), " - "last_granted (" PRINTF_TIME ", %d), " - "last_provisionally_granted (" PRINTF_TIME ", %d).", - fed->enclave.next_event.time - start_time, - fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, - fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, - fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep - ); + "completed (" PRINTF_TIME ", %d), " + "last_granted (" PRINTF_TIME ", %d), " + "last_provisionally_granted (" PRINTF_TIME ", %d).", + fed->enclave.next_event.time - start_time, + fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, + fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, + fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); return; } - // Forward the message or message chunk. - int destination_socket = fed->socket; - LF_PRINT_DEBUG( "RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, - length - ); - - // Record this in-transit message in federate's in-transit message queue. - if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { - // Add a record of this message to the list of in-transit messages to this federate. 
- add_in_transit_message_record( - fed->in_transit_message_tags, - intended_tag - ); - LF_PRINT_DEBUG( - "RTI: Adding a message with tag " PRINTF_TAG " to the list of in-transit messages for federate %d.", - intended_tag.time - lf_time_start(), - intended_tag.microstep, - federate_id - ); - } else { - lf_print_error( - "RTI: Federate %d has already completed tag " PRINTF_TAG - ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " - "This is going to cause an STP violation under centralized coordination.", - federate_id, - fed->enclave.completed.time - lf_time_start(), - fed->enclave.completed.microstep, - intended_tag.time - lf_time_start(), - intended_tag.microstep, - sending_federate->enclave.id - ); - // FIXME: Drop the federate? - } + length); // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. @@ -469,7 +464,7 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff tracepoint_rti_to_federate(rti_remote->base.trace, send_TAGGED_MSG, federate_id, &intended_tag); } - write_to_socket_errexit(destination_socket, bytes_read, buffer, + write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, "RTI failed to forward message to federate %d.", federate_id); // The message length may be longer than the buffer, @@ -481,7 +476,7 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff if (bytes_to_read > FED_COM_BUFFER_SIZE) { bytes_to_read = FED_COM_BUFFER_SIZE; } - read_from_socket_errexit(sending_federate->socket, bytes_to_read, buffer, + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, "RTI failed to read message chunks."); total_bytes_read += bytes_to_read; @@ -489,57 +484,84 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff // do not write to destination_socket and cause interleaving. 
However, // holding the rti_mutex might be very expensive. Instead, each outgoing // socket should probably have its own mutex. - write_to_socket_errexit(destination_socket, bytes_to_read, buffer, - "RTI failed to send message chunks."); + write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, + "RTI failed to send message chunks."); + } + + // Record this in-transit message in federate's in-transit message queue. + if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { + // Add a record of this message to the list of in-transit messages to this federate. + pqueue_tag_insert_if_no_match( + fed->in_transit_message_tags, + intended_tag); + LF_PRINT_DEBUG( + "RTI: Adding a message with tag " PRINTF_TAG " to the list of in-transit messages for federate %d.", + intended_tag.time - lf_time_start(), + intended_tag.microstep, + federate_id); + } else { + lf_print_error( + "RTI: Federate %d has already completed tag " PRINTF_TAG + ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " + "This is going to cause an STP violation under centralized coordination.", + federate_id, + fed->enclave.completed.time - lf_time_start(), + fed->enclave.completed.microstep, + intended_tag.time - lf_time_start(), + intended_tag.microstep, + sending_federate->enclave.id); + // FIXME: Drop the federate? } - update_federate_next_event_tag_locked(federate_id, intended_tag); + // If the message tag is less than the most recently received NET from the federate, + // then update the federate's next event tag to match the message tag. 
+ if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { + update_federate_next_event_tag_locked(federate_id, intended_tag); + } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); } -void handle_logical_tag_complete(federate_info_t* fed) { +void handle_latest_tag_complete(federate_info_t *fed) { unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_errexit(fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, - "RTI failed to read the content of the logical tag complete from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the logical tag complete from federate %d.", + fed->enclave.id); tag_t completed = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_LTC, fed->enclave.id, &completed); } _logical_tag_complete(&(fed->enclave), completed); // FIXME: Should this function be in the enclave version? - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); // See if we can remove any of the recorded in-transit messages for this. 
- clean_in_transit_message_record_up_to_tag(fed->in_transit_message_tags, fed->enclave.completed); - lf_mutex_unlock(&rti_mutex); + pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); + LF_MUTEX_UNLOCK(rti_mutex); } -void handle_next_event_tag(federate_info_t* fed) { +void handle_next_event_tag(federate_info_t *fed) { unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_errexit(fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, - "RTI failed to read the content of the next event tag from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the next event tag from federate %d.", + fed->enclave.id); // Acquire a mutex lock to ensure that this state does not change while a // message is in transport or being used to determine a TAG. - lf_mutex_lock(&rti_mutex); // FIXME: Instead of using a mutex, - // it might be more efficient to use a - // select() mechanism to read and process - // federates' buffers in an orderly fashion. - + LF_MUTEX_LOCK(rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a + // select() mechanism to read and process federates' buffers in an orderly fashion. 
tag_t intended_tag = extract_tag(buffer); if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_NET, fed->enclave.id, &intended_tag); } LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, - fed->enclave.id, intended_tag.time - start_time, - intended_tag.microstep); + fed->enclave.id, intended_tag.time - start_time, + intended_tag.microstep); update_federate_next_event_tag_locked( fed->enclave.id, - intended_tag - ); - lf_mutex_unlock(&rti_mutex); + intended_tag); + LF_MUTEX_UNLOCK(rti_mutex); } /////////////////// STOP functions //////////////////// @@ -548,7 +570,7 @@ void handle_next_event_tag(federate_info_t* fed) { * Boolean used to prevent the RTI from sending the * MSG_TYPE_STOP_GRANTED message multiple times. */ -bool _lf_rti_stop_granted_already_sent_to_federates = false; +bool stop_granted_already_sent_to_federates = false; /** * Once the RTI has seen proposed tags from all connected federates, @@ -556,12 +578,14 @@ bool _lf_rti_stop_granted_already_sent_to_federates = false; * This function also checks the most recently received NET from * each federate and resets that be no greater than the _RTI.max_stop_tag. * - * This function assumes the caller holds the _RTI.rti_mutex lock. + * This function assumes the caller holds the rti_mutex lock. 
*/ -void _lf_rti_broadcast_stop_time_to_federates_locked() { - if (_lf_rti_stop_granted_already_sent_to_federates == true) { +static void broadcast_stop_time_to_federates_locked() { + if (stop_granted_already_sent_to_federates == true) { return; } + stop_granted_already_sent_to_federates = true; + // Reply with a stop granted to all federates unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); @@ -579,88 +603,121 @@ void _lf_rti_broadcast_stop_time_to_federates_locked() { if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); } - write_to_socket_errexit(fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, + write_to_socket_fail_on_error( + &fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", fed->enclave.id); } - LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, - rti_remote->base.max_stop_tag.microstep); - _lf_rti_stop_granted_already_sent_to_federates = true; + LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, + rti_remote->base.max_stop_tag.time - start_time, + rti_remote->base.max_stop_tag.microstep); } -void mark_federate_requesting_stop(federate_info_t* fed) { +/** + * Mark a federate requesting stop. If the number of federates handling stop reaches the + * NUM_OF_FEDERATES, broadcast MSG_TYPE_STOP_GRANTED to every federate. + * This function assumes the _RTI.mutex is already locked. + * @param fed The federate that has requested a stop. + * @return 1 if stop time has been sent to all federates and 0 otherwise. 
+ */ +static int mark_federate_requesting_stop(federate_info_t *fed) { if (!fed->requested_stop) { - // Assume that the federate - // has requested stop rti_remote->base.num_scheduling_nodes_handling_stop++; fed->requested_stop = true; } - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { + if (rti_remote->base.num_scheduling_nodes_handling_stop + == rti_remote->base.number_of_scheduling_nodes) { // We now have information about the stop time of all // federates. - _lf_rti_broadcast_stop_time_to_federates_locked(); + broadcast_stop_time_to_federates_locked(); + return 1; } + return 0; +} + +/** + * Thread to time out if federates do not reply to stop request. + */ +static void* wait_for_stop_request_reply(void* args) { + // Divide the time into small chunks and check periodically. + interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST/30; + int count = 0; + while (count++ < 30) { + if (stop_granted_already_sent_to_federates) return NULL; + lf_sleep(chunk); + } + // If we reach here, then error out. + lf_print_error_and_exit("Received only %d stop request replies within timeout " + PRINTF_TIME "ns. 
RTI is exiting.", + rti_remote->base.num_scheduling_nodes_handling_stop, + MAX_TIME_FOR_REPLY_TO_STOP_REQUEST + ); + return NULL; } -void handle_stop_request_message(federate_info_t* fed) { +void handle_stop_request_message(federate_info_t *fed) { LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(fed->socket, bytes_to_read, buffer, - "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, + "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", + fed->enclave.id); + + // Extract the proposed stop tag for the federate + tag_t proposed_stop_tag = extract_tag(buffer); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + } + + LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); // Acquire a mutex lock to ensure that this state does change while a // message is in transport or being used to determine a TAG. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); // Check whether we have already received a stop_tag // from this federate if (fed->requested_stop) { - // Ignore this request - lf_mutex_unlock(&rti_mutex); + // If stop request messages have already been broadcast, treat this as if it were a reply. 
+ if (rti_remote->stop_in_progress) { + mark_federate_requesting_stop(fed); + } + LF_MUTEX_UNLOCK(rti_mutex); return; } - // Extract the proposed stop tag for the federate - tag_t proposed_stop_tag = extract_tag(buffer); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); - } - // Update the maximum stop tag received from federates if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { rti_remote->base.max_stop_tag = proposed_stop_tag; } - LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); - - // If this federate has not already asked - // for a stop, add it to the tally. - mark_federate_requesting_stop(fed); - - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { - // We now have information about the stop time of all - // federates. This is extremely unlikely, but it can occur - // all federates call lf_request_stop() at the same tag. - lf_mutex_unlock(&rti_mutex); + // If all federates have replied, send stop request granted. + if (mark_federate_requesting_stop(fed)) { + // Have send stop request granted to all federates. Nothing more to do. + LF_MUTEX_UNLOCK(rti_mutex); return; } + // Forward the stop request to all other federates that have not // also issued a stop request. unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + ENCODE_STOP_REQUEST(stop_request_buffer, + rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message // if we do not have a stop_time already for them. Do not do this more than once. 
if (rti_remote->stop_in_progress) { - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); return; } rti_remote->stop_in_progress = true; + // Need a timeout here in case a federate never replies. + lf_thread_t timeout_thread; + lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t *f = GET_FED_INFO(i); if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { @@ -671,24 +728,22 @@ void handle_stop_request_message(federate_info_t* fed) { if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } - write_to_socket_errexit(f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, + write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", f->enclave.id); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); - } } } LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, - rti_remote->base.max_stop_tag.microstep); - lf_mutex_unlock(&rti_mutex); + rti_remote->base.max_stop_tag.time - start_time, + rti_remote->base.max_stop_tag.microstep); + LF_MUTEX_UNLOCK(rti_mutex); } -void handle_stop_request_reply(federate_info_t* fed) { +void handle_stop_request_reply(federate_info_t *fed) { size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; unsigned char buffer_stop_time[bytes_to_read]; - read_from_socket_errexit(fed->socket, bytes_to_read, buffer_stop_time, - "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, 
NULL, + "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", + fed->enclave.id); tag_t federate_stop_tag = extract_tag(buffer_stop_time); @@ -701,13 +756,13 @@ void handle_stop_request_reply(federate_info_t* fed) { federate_stop_tag.microstep); // Acquire the mutex lock so that we can change the state of the RTI - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); // If the federate has not requested stop before, count the reply if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { rti_remote->base.max_stop_tag = federate_stop_tag; } mark_federate_requesting_stop(fed); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); } ////////////////////////////////////////////////// @@ -716,14 +771,12 @@ void handle_address_query(uint16_t fed_id) { federate_info_t *fed = GET_FED_INFO(fed_id); // Use buffer both for reading and constructing the reply. // The length is what is needed for the reply. - unsigned char buffer[sizeof(int32_t)]; - ssize_t bytes_read = read_from_socket(fed->socket, sizeof(uint16_t), (unsigned char*)buffer); - if (bytes_read == 0) { - lf_print_error_and_exit("Failed to read address query."); - } + unsigned char buffer[1 + sizeof(int32_t)]; + read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char *)buffer, NULL, + "Failed to read address query."); uint16_t remote_fed_id = extract_uint16(buffer); - if (rti_remote->base.tracing_enabled){ + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_ADR_QR, fed_id, NULL); } @@ -733,22 +786,28 @@ void handle_address_query(uint16_t fed_id) { // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message // from this federate. In that case, it will respond by sending -1. + // Response message is also of type MSG_TYPE_ADDRESS_QUERY. + buffer[0] = MSG_TYPE_ADDRESS_QUERY; + // Encode the port number. 
federate_info_t *remote_fed = GET_FED_INFO(remote_fed_id); - encode_int32(remote_fed->server_port, (unsigned char*)buffer); + // Send the port number (which could be -1). - write_to_socket_errexit(fed->socket, sizeof(int32_t), (unsigned char*)buffer, - "Failed to write port number to socket of federate %d.", fed_id); + LF_MUTEX_LOCK(rti_mutex); + encode_int32(remote_fed->server_port, (unsigned char *)&buffer[1]); + write_to_socket_fail_on_error( + &fed->socket, sizeof(int32_t) + 1, (unsigned char *)buffer, &rti_mutex, + "Failed to write port number to socket of federate %d.", fed_id); // Send the server IP address to federate. - write_to_socket_errexit(fed->socket, sizeof(remote_fed->server_ip_addr), - (unsigned char *)&remote_fed->server_ip_addr, - "Failed to write ip address to socket of federate %d.", fed_id); - - if (remote_fed->server_port != -1) { - LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", - fed_id, remote_fed->server_hostname, remote_fed->server_port); - } + write_to_socket_fail_on_error( + &fed->socket, sizeof(remote_fed->server_ip_addr), + (unsigned char *)&remote_fed->server_ip_addr, &rti_mutex, + "Failed to write ip address to socket of federate %d.", fed_id); + LF_MUTEX_UNLOCK(rti_mutex); + + LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", + fed_id, remote_fed->server_hostname, remote_fed->server_port); } void handle_address_ad(uint16_t federate_id) { @@ -757,34 +816,28 @@ void handle_address_ad(uint16_t federate_id) { // connections to other federates int32_t server_port = -1; unsigned char buffer[sizeof(int32_t)]; - ssize_t bytes_read = read_from_socket(fed->socket, sizeof(int32_t), (unsigned char *)buffer); - - if (bytes_read < (ssize_t)sizeof(int32_t)) { - LF_PRINT_DEBUG("Error reading port data from federate %d.", federate_id); - // Leave the server port at -1, which means "I don't know". 
- return; - } + read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char *)buffer, NULL, + "Error reading port data from federate %d.", federate_id); server_port = extract_int32(buffer); assert(server_port < 65536); - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); fed->server_port = server_port; + LF_MUTEX_UNLOCK(rti_mutex); + + LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_ADR_AD, federate_id, NULL); } - LF_PRINT_LOG("Received address advertisement from federate %d.", federate_id); - lf_mutex_unlock(&rti_mutex); } void handle_timestamp(federate_info_t *my_fed) { unsigned char buffer[sizeof(int64_t)]; // Read bytes from the socket. We need 8 bytes. - ssize_t bytes_read = read_from_socket(my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer); - if (bytes_read < (ssize_t)sizeof(int64_t)) { - lf_print_error("ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); - } + read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char *)&buffer, NULL, + "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t *)(&buffer))); if (rti_remote->base.tracing_enabled) { @@ -793,7 +846,7 @@ void handle_timestamp(federate_info_t *my_fed) { } LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); rti_remote->num_feds_proposed_start++; if (timestamp > rti_remote->max_start_time) { rti_remote->max_start_time = timestamp; @@ -810,7 +863,7 @@ void handle_timestamp(federate_info_t *my_fed) { } } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); // Send back to the federate the maximum time plus an offset on a TIMESTAMP // message. 
@@ -824,28 +877,24 @@ void handle_timestamp(federate_info_t *my_fed) { tag_t tag = {.time = start_time, .microstep = 0}; tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &tag); } - ssize_t bytes_written = write_to_socket( - my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, - start_time_buffer - ); - if (bytes_written < MSG_TYPE_TIMESTAMP_LENGTH) { + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, start_time_buffer)) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to // the federate to the start time. my_fed->enclave.state = GRANTED; lf_cond_broadcast(&sent_start_time); LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); } -void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { +void send_physical_clock(unsigned char message_type, federate_info_t *fed, socket_type_t socket_type) { if (fed->enclave.state == NOT_CONNECTED) { lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", - fed->enclave.id); + fed->enclave.id); return; } unsigned char buffer[sizeof(int64_t) + 1]; @@ -858,29 +907,32 @@ void send_physical_clock(unsigned char message_type, federate_info_t* fed, socke // FIXME: UDP_addr is never initialized. 
LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, - (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); + (struct sockaddr *)&fed->UDP_addr, sizeof(fed->UDP_addr)); if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", - fed->enclave.id, - strerror(errno)); + fed->enclave.id, + strerror(errno)); return; } - } else if (socket_type == TCP) { + } + else if (socket_type == TCP) { LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); - write_to_socket_errexit(fed->socket, 1 + sizeof(int64_t), buffer, - "Clock sync: RTI failed to send physical time to federate %d: %s.", - fed->enclave.id, - strerror(errno)); - } - LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", - current_physical_time, - fed->enclave.id); + LF_MUTEX_LOCK(rti_mutex); + write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, + "Clock sync: RTI failed to send physical time to federate %d.", + fed->enclave.id); + LF_MUTEX_UNLOCK(rti_mutex); + } + LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME + " to federate %d.", + current_physical_time, + fed->enclave.id); } -void handle_physical_clock_sync_message(federate_info_t* my_fed, socket_type_t socket_type) { +void handle_physical_clock_sync_message(federate_info_t *my_fed, socket_type_t socket_type) { // Lock the mutex to prevent interference between sending the two // coded probe messages. 
- lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); // Reply with a T4 type message send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); // Send the corresponding coded probe immediately after, @@ -888,18 +940,18 @@ void handle_physical_clock_sync_message(federate_info_t* my_fed, socket_type_t s if (socket_type == UDP) { send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); } -void* clock_synchronization_thread(void* noargs) { +void *clock_synchronization_thread(void *noargs) { // Wait until all federates have been notified of the start time. // FIXME: Use lf_ version of this when merged with master. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { lf_cond_wait(&received_start_times); } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); // Wait until the start time before starting clock synchronization. // The above wait ensures that start_time has been set. 
@@ -911,8 +963,8 @@ void* clock_synchronization_thread(void* noargs) { // Initiate a clock synchronization every rti->clock_sync_period_ns // Initiate a clock synchronization every rti->clock_sync_period_ns - struct timespec sleep_time = {(time_t) rti_remote->clock_sync_period_ns / BILLION, - rti_remote->clock_sync_period_ns % BILLION}; + struct timespec sleep_time = {(time_t)rti_remote->clock_sync_period_ns / BILLION, + rti_remote->clock_sync_period_ns % BILLION}; struct timespec remaining_time; bool any_federates_connected = true; @@ -921,7 +973,7 @@ void* clock_synchronization_thread(void* noargs) { lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted any_federates_connected = false; for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t *fed = GET_FED_INFO(fed_id); if (fed->enclave.state == NOT_CONNECTED) { // FIXME: We need better error handling here, but clock sync failure // should not stop execution. @@ -944,17 +996,17 @@ void* clock_synchronization_thread(void* noargs) { int remaining_attempts = 5; while (remaining_attempts > 0) { remaining_attempts--; - int bytes_read = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); + int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); // If any errors occur, either discard the message or the clock sync round. - if (bytes_read == message_size) { + if (!read_failed) { if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { int32_t fed_id_2 = extract_int32(&(buffer[1])); // Check that this message came from the correct federate. if (fed_id_2 != fed->enclave.id) { // Message is from the wrong federate. Discard the message. lf_print_warning("Clock sync: Received T3 message from federate %d, " - "but expected one from %d. Discarding message.", - fed_id_2, fed->enclave.id); + "but expected one from %d. 
Discarding message.", + fed_id_2, fed->enclave.id); continue; } LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); @@ -964,7 +1016,8 @@ void* clock_synchronization_thread(void* noargs) { // The message is not a T3 message. Discard the message and // continue waiting for the T3 message. This is possibly a message // from a previous cycle that was discarded. - lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " + lf_print_warning( + "Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " "Discarding message.", buffer[0], MSG_TYPE_CLOCK_SYNC_T3, @@ -987,21 +1040,73 @@ void* clock_synchronization_thread(void* noargs) { return NULL; } -void handle_federate_resign(federate_info_t *my_fed) { +/** + * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate + * that is exiting in failure. In this case, the RTI will + * also terminate abnormally, returning a non-zero exit code when it exits. + * + * This function assumes the caller does not hold the mutex. + * + * @param my_fed The federate sending a MSG_TYPE_FAILED message. + */ +static void handle_federate_failed(federate_info_t *my_fed) { // Nothing more to do. Close the socket and exit. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(rti_mutex); + if (rti_remote->base.tracing_enabled) { - // Extract the tag, for tracing purposes - size_t header_size = 1 + sizeof(tag_t); - unsigned char buffer[header_size]; - // Read the header, minus the first byte which has already been read. 
- read_from_socket_errexit(my_fed->socket, header_size - 1, &(buffer[1]), - "RTI failed to read the timed message header from remote federate."); - // Extract the tag sent by the resigning federate - tag_t tag = extract_tag(&(buffer[1])); - tracepoint_rti_from_federate(rti_remote->base.trace, receive_RESIGN, my_fed->enclave.id, &tag); + tracepoint_rti_from_federate(rti_remote->base.trace, receive_FAILED, my_fed->enclave.id, NULL); } + // Set the flag telling the RTI to exit with an error code when it exits. + _lf_federate_reports_error = true; + lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + + my_fed->enclave.state = NOT_CONNECTED; + + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; + + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_RDWR); + + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h + + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool *visited = (bool *)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. + notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); + + LF_MUTEX_UNLOCK(rti_mutex); +} + +/** + * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination + * after all shutdown events are processed on the federate. + * + * This function assumes the caller does not hold the mutex. 
+ * + * @note At this point, the RTI might have outgoing messages to the federate. This + * function thus first performs a shutdown on the socket, which sends an EOF. It then + * waits for the remote socket to be closed before closing the socket itself. + * + * @param my_fed The federate sending a MSG_TYPE_RESIGN message. + */ +static void handle_federate_resign(federate_info_t *my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(rti_mutex); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_RESIGN, my_fed->enclave.id, NULL); + } + + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); + my_fed->enclave.state = NOT_CONNECTED; // Indicate that there will no further events from this federate. @@ -1012,24 +1117,28 @@ void handle_federate_resign(federate_info_t *my_fed) { // Here, we just signal the other side that no further writes to the socket are // forthcoming, which should result in the other end getting a zero-length reception. shutdown(my_fed->socket, SHUT_WR); - // Do not close because this results in an error on the other side rather than - // an orderly shutdown. - // close(my_fed->socket); // from unistd.h - lf_print("Federate %d has resigned.", my_fed->enclave.id); + // Wait for the federate to send an EOF or a socket error to occur. + // Discard any incoming bytes. Normally, this read should return 0 because + // the federate is resigning and should itself invoke shutdown. + unsigned char buffer[10]; + while (read(my_fed->socket, buffer, 10) > 0); + + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h // Check downstream federates to see whether they should now be granted a TAG. // To handle cycles, need to create a boolean array to keep // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. 
+ bool *visited = (bool *)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); free(visited); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(rti_mutex); } -void* federate_info_thread_TCP(void* fed) { - federate_info_t* my_fed = (federate_info_t*)fed; +void *federate_info_thread_TCP(void *fed) { + federate_info_t *my_fed = (federate_info_t *)fed; // Buffer for incoming messages. // This does not constrain the message size because messages @@ -1039,8 +1148,8 @@ void* federate_info_thread_TCP(void* fed) { // Listen for messages from the federate. while (my_fed->enclave.state != NOT_CONNECTED) { // Read no more than one byte to get the message type. - ssize_t bytes_read = read_from_socket(my_fed->socket, 1, buffer); - if (bytes_read < 1) { + int read_failed = read_from_socket(my_fed->socket, 1, buffer); + if (read_failed) { // Socket is closed lf_print_warning("RTI: Socket to federate %d is closed. Exiting the thread.", my_fed->enclave.id); my_fed->enclave.state = NOT_CONNECTED; @@ -1049,80 +1158,104 @@ void* federate_info_thread_TCP(void* fed) { break; } LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); - switch(buffer[0]) { - case MSG_TYPE_TIMESTAMP: - handle_timestamp(my_fed); - break; - case MSG_TYPE_ADDRESS_QUERY: - handle_address_query(my_fed->enclave.id); - break; - case MSG_TYPE_ADDRESS_ADVERTISEMENT: - handle_address_ad(my_fed->enclave.id); - break; - case MSG_TYPE_TAGGED_MESSAGE: - handle_timed_message(my_fed, buffer); - break; - case MSG_TYPE_RESIGN: - handle_federate_resign(my_fed); - return NULL; - break; - case MSG_TYPE_NEXT_EVENT_TAG: - handle_next_event_tag(my_fed); - break; - case MSG_TYPE_LOGICAL_TAG_COMPLETE: - handle_logical_tag_complete(my_fed); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(my_fed); // FIXME: Reviewed until here. 
- // Need to also look at - // notify_advance_grant_if_safe() - // and notify_downstream_advance_grant_if_safe() - break; - case MSG_TYPE_STOP_REQUEST_REPLY: - handle_stop_request_reply(my_fed); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(my_fed, buffer); - break; - default: - lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, buffer[0]); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_UNIDENTIFIED, my_fed->enclave.id, NULL); - } + switch (buffer[0]) { + case MSG_TYPE_TIMESTAMP: + handle_timestamp(my_fed); + break; + case MSG_TYPE_ADDRESS_QUERY: + handle_address_query(my_fed->enclave.id); + break; + case MSG_TYPE_ADDRESS_ADVERTISEMENT: + handle_address_ad(my_fed->enclave.id); + break; + case MSG_TYPE_TAGGED_MESSAGE: + handle_timed_message(my_fed, buffer); + break; + case MSG_TYPE_RESIGN: + handle_federate_resign(my_fed); + return NULL; + case MSG_TYPE_NEXT_EVENT_TAG: + handle_next_event_tag(my_fed); + break; + case MSG_TYPE_LATEST_TAG_COMPLETE: + handle_latest_tag_complete(my_fed); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(my_fed); // FIXME: Reviewed until here. + // Need to also look at + // notify_advance_grant_if_safe() + // and notify_downstream_advance_grant_if_safe() + break; + case MSG_TYPE_STOP_REQUEST_REPLY: + handle_stop_request_reply(my_fed); + break; + case MSG_TYPE_PORT_ABSENT: + handle_port_absent_message(my_fed, buffer); + break; + case MSG_TYPE_FAILED: + handle_federate_failed(my_fed); + return NULL; + default: + lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, buffer[0]); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_UNIDENTIFIED, my_fed->enclave.id, NULL); + } } } // Nothing more to do. Close the socket and exit. 
+ // Prevent multiple threads from closing the same socket at the same time. + LF_MUTEX_LOCK(rti_mutex); close(my_fed->socket); // from unistd.h - + LF_MUTEX_UNLOCK(rti_mutex); return NULL; } -void send_reject(int socket_id, unsigned char error_code) { +void send_reject(int *socket_id, unsigned char error_code) { LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); unsigned char response[2]; response[0] = MSG_TYPE_REJECT; response[1] = error_code; + LF_MUTEX_LOCK(rti_mutex); // NOTE: Ignore errors on this response. - write_to_socket_errexit(socket_id, 2, response, "RTI failed to write MSG_TYPE_REJECT message on the socket."); + if (write_to_socket(*socket_id, 2, response)) { + lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); + } // Close the socket. - close(socket_id); + shutdown(*socket_id, SHUT_RDWR); + close(*socket_id); + *socket_id = -1; + LF_MUTEX_UNLOCK(rti_mutex); } -int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* client_fd) { +/** + * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload + * a federate ID and a federation ID. If the federation ID + * matches this federation, send an MSG_TYPE_ACK and otherwise send + * a MSG_TYPE_REJECT message. + * @param socket_id Pointer to the socket on which to listen. + * @param client_fd The socket address. + * @return The federate ID for success or -1 for failure. + */ +static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_in *client_fd) { // Buffer for message ID, federate ID, and federation ID length. size_t length = 1 + sizeof(uint16_t) + 1; // Message ID, federate ID, length of fedration ID. unsigned char buffer[length]; // Read bytes from the socket. We need 4 bytes. - // FIXME: This should not exit with error but rather should just reject the connection. 
- read_from_socket_errexit(socket_id, length, buffer, "RTI failed to read from accepted socket."); + if (read_from_socket_close_on_error(socket_id, length, buffer)) { + lf_print_error("RTI failed to read from accepted socket."); + return -1; + } uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. // First byte received is the message type. if (buffer[0] != MSG_TYPE_FED_IDS) { - if(buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + } + if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { // The federate is trying to connect to a peer, not to the RTI. // It has connected to the RTI instead. // FIXME: This should not happen, but apparently has been observed. @@ -1134,9 +1267,6 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie } else { send_reject(socket_id, UNEXPECTED_MESSAGE); } - if (rti_remote->base.tracing_enabled){ - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); return -1; } else { @@ -1148,10 +1278,11 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 1]; char federation_id_received[federation_id_length + 1]; // One extra for null terminator. // Next read the actual federation ID. - // FIXME: This should not exit on error, but rather just reject the connection. 
- read_from_socket_errexit(socket_id, federation_id_length, - (unsigned char*)federation_id_received, - "RTI failed to read federation id from federate %d.", fed_id); + if (read_from_socket_close_on_error(socket_id, federation_id_length, + (unsigned char *)federation_id_received)) { + lf_print_error("RTI failed to read federation id from federate %d.", fed_id); + return -1; + } // Terminate the string with a null. federation_id_received[federation_id_length] = 0; @@ -1164,11 +1295,11 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie // Compare the received federation ID to mine. if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. - lf_print_error("WARNING: Federate from another federation %s attempted to connect to RTI in federation %s.\n", + lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", federation_id_received, rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); return -1; @@ -1176,14 +1307,13 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { // Federate ID is out of range. 
lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); - if (rti_remote->base.tracing_enabled){ + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; } else { if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); @@ -1194,26 +1324,26 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie } } } - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t *fed = GET_FED_INFO(fed_id); // The MSG_TYPE_FED_IDS message has the right federation ID. // Assign the address information for federate. // The IP address is stored here as an in_addr struct (in .server_ip_addr) that can be useful // to create sockets and can be efficiently sent over the network. // First, convert the sockaddr structure into a sockaddr_in that contains an internet address. - struct sockaddr_in* pV4_addr = client_fd; + struct sockaddr_in *pV4_addr = client_fd; // Then extract the internet address (which is in IPv4 format) and assign it as the federate's socket server fed->server_ip_addr = pV4_addr->sin_addr; #if LOG_LEVEL >= LOG_LEVEL_DEBUG // Create the human readable format and copy that into // the .server_hostname field of the federate. 
- char str[INET_ADDRSTRLEN]; - inet_ntop( AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN ); - strncpy (fed->server_hostname, str, INET_ADDRSTRLEN); + char str[INET_ADDRSTRLEN + 1]; + inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); + strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); #endif - fed->socket = socket_id; + fed->socket = *socket_id; // Set the federate's state as pending // because it is waiting for the start time to be @@ -1226,105 +1356,136 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_ACK, fed_id, NULL); } - write_to_socket_errexit(socket_id, 1, &ack_message, - "RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + LF_MUTEX_LOCK(rti_mutex); + if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { + LF_MUTEX_UNLOCK(rti_mutex); + lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + return -1; + } + LF_MUTEX_UNLOCK(rti_mutex); + + LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); return (int32_t)fed_id; } -int receive_connection_information(int socket_id, uint16_t fed_id) { +/** + * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill + * out the relevant information in the federate's struct. + * @return 1 on success and 0 on failure. 
+ */ +static int receive_connection_information(int *socket_id, uint16_t fed_id) { LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; - read_from_socket_errexit( - socket_id, - MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, - connection_info_header, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", - fed_id - ); + read_from_socket_fail_on_error( + socket_id, + MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, + connection_info_header, + NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", + fed_id); if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", fed_id, connection_info_header[0]); + lf_print_error( + "RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. 
" + "Rejecting federate.", + fed_id, connection_info_header[0]); send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } else { - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t *fed = GET_FED_INFO(fed_id); // Read the number of upstream and downstream connections fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); LF_PRINT_DEBUG( - "RTI got %d upstreams and %d downstreams from federate %d.", - fed->enclave.num_upstream, - fed->enclave.num_downstream, - fed_id); + "RTI got %d upstreams and %d downstreams from federate %d.", + fed->enclave.num_upstream, + fed->enclave.num_downstream, + fed_id); // Allocate memory for the upstream and downstream pointers if (fed->enclave.num_upstream > 0) { - fed->enclave.upstream = (int*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); + fed->enclave.upstream = (int *)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); // Allocate memory for the upstream delay pointers - fed->enclave.upstream_delay = - (interval_t*)malloc( - sizeof(interval_t) * fed->enclave.num_upstream - ); + fed->enclave.upstream_delay = (interval_t *)malloc( + sizeof(interval_t) * fed->enclave.num_upstream); } else { - fed->enclave.upstream = (int*)NULL; - fed->enclave.upstream_delay = (interval_t*)NULL; + fed->enclave.upstream = (int *)NULL; + fed->enclave.upstream_delay = (interval_t *)NULL; } if (fed->enclave.num_downstream > 0) { - fed->enclave.downstream = (int*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); + fed->enclave.downstream = (int *)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); } else { - fed->enclave.downstream = (int*)NULL; + fed->enclave.downstream = (int *)NULL; } - size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * - fed->enclave.num_upstream) + (sizeof(uint16_t) * fed->enclave.num_downstream); - unsigned char* connections_info_body = (unsigned 
char*)malloc(connections_info_body_size); - read_from_socket_errexit( - socket_id, - connections_info_body_size, - connections_info_body, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", - fed_id - ); - - // Keep track of where we are in the buffer - size_t message_head = 0; - // First, read the info about upstream federates - for (int i=0; ienclave.num_upstream; i++) { - fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); - message_head += sizeof(int64_t); - } + size_t connections_info_body_size = ( + (sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + + (sizeof(uint16_t) * fed->enclave.num_downstream); + unsigned char *connections_info_body = NULL; + if (connections_info_body_size > 0) { + connections_info_body = (unsigned char *)malloc(connections_info_body_size); + read_from_socket_fail_on_error( + socket_id, + connections_info_body_size, + connections_info_body, + NULL, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", + fed_id); + // Keep track of where we are in the buffer + size_t message_head = 0; + // First, read the info about upstream federates + for (int i = 0; i < fed->enclave.num_upstream; i++) { + fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); + message_head += sizeof(int64_t); + } - // Next, read the info about downstream federates - for (int i=0; ienclave.num_downstream; i++) { - fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - } + // Next, read the info about downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + fed->enclave.downstream[i] = 
extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + } - free(connections_info_body); - return 1; + free(connections_info_body); + } } + LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); + return 1; } -int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { +/** + * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up + * clock synchronization and perform the initial clock synchronization. + * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message + * payload is not UINT16_MAX. If it is also not 0, then this function sets + * up to perform runtime clock synchronization using the UDP port number + * specified in the payload to communicate with the federate's clock + * synchronization logic. + * @param socket_id The socket on which to listen. + * @param fed_id The federate ID. + * @return 1 for success, 0 for failure. + */ +static int receive_udp_message_and_set_up_clock_sync(int *socket_id, uint16_t fed_id) { // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of // clock synchronization. This message will tell the RTI whether the federate // is doing clock synchronization, and if it is, what port to use for UDP. LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); unsigned char response[1 + sizeof(uint16_t)]; - read_from_socket_errexit(socket_id, 1 + sizeof(uint16_t) , response, + read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); if (response[0] != MSG_TYPE_UDP_PORT) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", fed_id, response[0]); + lf_print_error( + "RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. 
" + "Rejecting federate.", + fed_id, response[0]); send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } else { federate_info_t *fed = GET_FED_INFO(fed_id); - if (rti_remote->clock_sync_global_status >= clock_sync_init) {// If no initial clock sync, no need perform initial clock sync. + if (rti_remote->clock_sync_global_status >= clock_sync_init) { + // If no initial clock sync, no need perform initial clock sync. uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); @@ -1333,14 +1494,14 @@ int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { if (federate_UDP_port_number != UINT16_MAX) { // Perform the initialization clock synchronization with the federate. // Send the required number of messages for clock synchronization - for (int i=0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { + for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { // Send the RTI's current physical time T1 to the federate. send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); // Listen for reply message, which should be T3. size_t message_size = 1 + sizeof(int32_t); unsigned char buffer[message_size]; - read_from_socket_errexit(socket_id, message_size, buffer, + read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, "Socket to federate %d unexpectedly closed.", fed_id); if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { int32_t fed_id = extract_int32(&(buffer[1])); @@ -1356,21 +1517,25 @@ int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { } LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); } - if (rti_remote->clock_sync_global_status >= clock_sync_on) { // If no runtime clock sync, no need to set up the UDP port. 
- if (federate_UDP_port_number > 0) { - // Initialize the UDP_addr field of the federate struct - fed->UDP_addr.sin_family = AF_INET; - fed->UDP_addr.sin_port = htons(federate_UDP_port_number); - fed->UDP_addr.sin_addr = fed->server_ip_addr; - } + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // If no runtime clock sync, no need to set up the UDP port. + if (federate_UDP_port_number > 0) { + // Initialize the UDP_addr field of the federate struct + fed->UDP_addr.sin_family = AF_INET; + fed->UDP_addr.sin_port = htons(federate_UDP_port_number); + fed->UDP_addr.sin_addr = fed->server_ip_addr; + } } else { - // Disable clock sync after initial round. - fed->clock_synchronization_enabled = false; + // Disable clock sync after initial round. + fed->clock_synchronization_enabled = false; } - } else { // No clock synchronization at all. + } else { + // No clock synchronization at all. + LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); // Clock synchronization is universally disabled via the clock-sync command-line parameter // (-c off was passed to the RTI). - // Note that the federates are still going to send a MSG_TYPE_UDP_PORT message but with a payload (port) of -1. + // Note that the federates are still going to send a + // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. fed->clock_synchronization_enabled = false; } } @@ -1378,16 +1543,22 @@ int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { } #ifdef __RTI_AUTH__ -bool authenticate_federate(int socket) { +/** + * Authenticate incoming federate by performing HMAC-based authentication. + * + * @param socket Pointer to the socket for the incoming federate trying to authenticate. + * @return True if authentication is successful and false otherwise. + */ +static bool authenticate_federate(int *socket) { // Wait for MSG_TYPE_FED_NONCE from federate.
size_t fed_id_length = sizeof(uint16_t); unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; - read_from_socket_errexit(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, - "Failed to read MSG_TYPE_FED_NONCE"); + read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, + "Failed to read MSG_TYPE_FED_NONCE"); if (buffer[0] != MSG_TYPE_FED_NONCE) { lf_print_error_and_exit( - "Received unexpected response %u from the FED (see net_common.h).", - buffer[0]); + "Received unexpected response %u from the FED (see net_common.h).", + buffer[0]); } unsigned int hmac_length = SHA256_HMAC_LENGTH; size_t federation_id_length = strnlen(rti_remote->federation_id, 255); @@ -1397,9 +1568,9 @@ bool authenticate_federate(int socket) { memcpy(&mac_buf[1], &buffer[1], fed_id_length); memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); unsigned char hmac_tag[hmac_length]; - unsigned char * ret = HMAC(EVP_sha256(), rti_remote->federation_id, - federation_id_length, mac_buf, 1 + fed_id_length + NONCE_LENGTH, - hmac_tag, &hmac_length); + unsigned char *ret = HMAC(EVP_sha256(), rti_remote->federation_id, + federation_id_length, mac_buf, 1 + fed_id_length + NONCE_LENGTH, + hmac_tag, &hmac_length); if (ret == NULL) { lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); } @@ -1410,12 +1581,14 @@ bool authenticate_federate(int socket) { RAND_bytes(rti_nonce, NONCE_LENGTH); memcpy(&sender[1], rti_nonce, NONCE_LENGTH); memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); - write_to_socket(socket, 1 + NONCE_LENGTH + hmac_length, sender); + if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { + lf_print_error("Failed to send nonce to federate."); + } // Wait for MSG_TYPE_FED_RESPONSE unsigned char received[1 + hmac_length]; - read_from_socket_errexit(socket, 1 + hmac_length, received, - "Failed to read federate response."); + read_from_socket_fail_on_error(socket, 1 + hmac_length, received, 
NULL, + "Failed to read federate response."); if (received[0] != MSG_TYPE_FED_RESPONSE) { lf_print_error_and_exit( "Received unexpected response %u from the federate (see net_common.h).", @@ -1428,7 +1601,7 @@ bool authenticate_federate(int socket) { memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); unsigned char rti_tag[hmac_length]; ret = HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, - mac_buf2, 1 + NONCE_LENGTH, rti_tag, &hmac_length); + mac_buf2, 1 + NONCE_LENGTH, rti_tag, &hmac_length); if (ret == NULL) { lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); } @@ -1445,20 +1618,20 @@ bool authenticate_federate(int socket) { } #endif -void connect_to_federates(int socket_descriptor) { +void lf_connect_to_federates(int socket_descriptor) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { // Wait for an incoming connection request. struct sockaddr client_fd; uint32_t client_length = sizeof(client_fd); // The following blocks until a federate connects. int socket_id = -1; - while(1) { + while (1) { socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); if (socket_id >= 0) { // Got a socket break; } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_and_exit("RTI failed to accept the socket. %s.", strerror(errno)); + lf_print_error_system_failure("RTI failed to accept the socket."); } else { // Try again lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); @@ -1466,23 +1639,27 @@ void connect_to_federates(int socket_descriptor) { } } - // Wait for the first message from the federate when RTI -a option is on. - #ifdef __RTI_AUTH__ +// Wait for the first message from the federate when RTI -a option is on. 
+#ifdef __RTI_AUTH__ if (rti_remote->authentication_enabled) { - if (!authenticate_federate(socket_id)) { + if (!authenticate_federate(&socket_id)) { lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; // Ignore the federate that failed authentication. i--; continue; } } - #endif - +#endif + // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(socket_id, (struct sockaddr_in*)&client_fd); - if (fed_id >= 0 - && receive_connection_information(socket_id, (uint16_t)fed_id) - && receive_udp_message_and_set_up_clock_sync(socket_id, (uint16_t)fed_id)) { + int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in *)&client_fd); + if (fed_id >= 0 && socket_id >= 0 + && receive_connection_information(&socket_id, (uint16_t)fed_id) + && receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { // Create a thread to communicate with the federate. // This has to be done after clock synchronization is finished @@ -1490,7 +1667,6 @@ void connect_to_federates(int socket_descriptor) { // synchronization messages. federate_info_t *fed = GET_FED_INFO(fed_id); lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - } else { // Received message was rejected. Try again. i--; @@ -1505,7 +1681,7 @@ void connect_to_federates(int socket_descriptor) { // federate is performing runtime clock synchronization. 
bool clock_sync_enabled = false; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); + federate_info_t *fed_info = GET_FED_INFO(i); if (fed_info->clock_synchronization_enabled) { clock_sync_enabled = true; break; @@ -1517,7 +1693,7 @@ void connect_to_federates(int socket_descriptor) { } } -void* respond_to_erroneous_connections(void* nothing) { +void *respond_to_erroneous_connections(void *nothing) { while (true) { // Wait for an incoming connection request. struct sockaddr client_fd; @@ -1536,46 +1712,43 @@ void* respond_to_erroneous_connections(void* nothing) { response[0] = MSG_TYPE_REJECT; response[1] = FEDERATION_ID_DOES_NOT_MATCH; // Ignore errors on this response. - write_to_socket_errexit(socket_id, 2, response, - "RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + } // Close the socket. + shutdown(socket_id, SHUT_RDWR); close(socket_id); } return NULL; } -void initialize_federate(federate_info_t* fed, uint16_t id) { +void initialize_federate(federate_info_t *fed, uint16_t id) { initialize_scheduling_node(&(fed->enclave), id); fed->requested_stop = false; - fed->socket = -1; // No socket. + fed->socket = -1; // No socket. fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = initialize_in_transit_message_q(); - strncpy(fed->server_hostname ,"localhost", INET_ADDRSTRLEN); + fed->in_transit_message_tags = pqueue_tag_init(10); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; } int32_t start_rti_server(uint16_t port) { - int32_t specified_port = port; - if (port == 0) { - // Use the default starting port. 
- port = STARTING_PORT; - } _lf_initialize_clock(); // Create the TCP socket server - rti_remote->socket_descriptor_TCP = create_server(specified_port, port, TCP); + rti_remote->socket_descriptor_TCP = create_rti_server(port, TCP); lf_print("RTI: Listening for federates."); // Create the UDP socket server - // Try to get the rti->final_port_TCP + 1 port + // Try to get the rti_remote->final_port_TCP + 1 port if (rti_remote->clock_sync_global_status >= clock_sync_on) { - rti_remote->socket_descriptor_UDP = create_server(specified_port, rti_remote->final_port_TCP + 1, UDP); + rti_remote->socket_descriptor_UDP = create_rti_server(rti_remote->final_port_TCP + 1, UDP); } return rti_remote->socket_descriptor_TCP; } void wait_for_federates(int socket_descriptor) { // Wait for connections from federates and create a thread for each. - connect_to_federates(socket_descriptor); + lf_connect_to_federates(socket_descriptor); // All federates have connected. lf_print("RTI: All expected federates have connected. Starting execution."); @@ -1588,20 +1761,20 @@ void wait_for_federates(int socket_descriptor) { lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); // Wait for federate threads to exit. - void* thread_exit_status; + void *thread_exit_status; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); + federate_info_t *fed = GET_FED_INFO(i); lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); lf_thread_join(fed->thread_id, &thread_exit_status); - free_in_transit_message_q(fed->in_transit_message_tags); + pqueue_tag_free(fed->in_transit_message_tags); lf_print("RTI: Federate %d thread exited.", fed->enclave.id); } rti_remote->all_federates_exited = true; - // Shutdown and close the socket so that the accept() call in - // respond_to_erroneous_connections returns. That thread should then - // check rti->all_federates_exited and it should exit. 
+ // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti_remote->all_federates_exited and it should exit. if (shutdown(socket_descriptor, SHUT_RDWR)) { LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); } @@ -1620,14 +1793,13 @@ void wait_for_federates(int socket_descriptor) { } } - -void initialize_RTI(rti_remote_t *rti){ +void initialize_RTI(rti_remote_t *rti) { rti_remote = rti; - // Initialize thread synchronization primitives - LF_ASSERT(lf_mutex_init(&rti_mutex) == 0, "Failed to initialize Mutex"); - LF_ASSERT(lf_cond_init(&received_start_times, &rti_mutex) == 0, "Failed to initialize Condition Variable"); - LF_ASSERT(lf_cond_init(&sent_start_time, &rti_mutex) == 0, "Failed to initialize Condition Variable"); + // Initialize thread synchronization primitives + LF_MUTEX_INIT(rti_mutex); + LF_COND_INIT(received_start_times, rti_mutex); + LF_COND_INIT(sent_start_time, rti_mutex); initialize_rti_common(&rti_remote->base); rti_remote->base.mutex = &rti_mutex; @@ -1650,12 +1822,14 @@ void initialize_RTI(rti_remote_t *rti){ rti_remote->stop_in_progress = false; } -void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { +void free_scheduling_nodes(scheduling_node_t **scheduling_nodes, uint16_t number_of_scheduling_nodes) { for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { // FIXME: Gives error freeing memory not allocated!!!!
- scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) free(node->upstream); - if (node->downstream != NULL) free(node->downstream); + scheduling_node_t *node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); } free(scheduling_nodes); } diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index b3249ec30..9303da42d 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -30,7 +30,10 @@ #endif #include "lf_types.h" -#include "message_record/message_record.h" +#include "pqueue_tag.h" + +/** Time allowed for federates to reply to stop request. */ +#define MAX_TIME_FOR_REPLY_TO_STOP_REQUEST SEC(30) ///////////////////////////////////////////// //// Data structures @@ -58,9 +61,9 @@ typedef struct federate_info_t { struct sockaddr_in UDP_addr; // The UDP address for the federate. bool clock_synchronization_enabled; // Indicates the status of clock synchronization // for this federate. Enabled by default. - in_transit_message_record_q_t* in_transit_message_tags; // Record of in-transit messages to this federate that are not - // yet processed. This record is ordered based on the time - // value of each message for a more efficient access. + pqueue_tag_t* in_transit_message_tags; // Record of in-transit messages to this federate that are not + // yet processed. This record is ordered based on the time + // value of each message for a more efficient access. char server_hostname[INET_ADDRSTRLEN]; // Human-readable IP address and int32_t server_port; // port number of the socket server of the federate // if it has any incoming direct connections from other federates. @@ -185,17 +188,9 @@ extern int lf_critical_section_enter(environment_t* env); extern int lf_critical_section_exit(environment_t* env); /** - * Create a server and enable listening for socket connections. 
- * - * @note This function is similar to create_server(...) in - * federate.c. However, it contains logs that are specific - * to the RTI. - * - * @param port The port number to use. - * @param socket_type The type of the socket for the server (TCP or UDP). - * @return The socket descriptor on which to accept connections. + * Indicator that one or more federates have reported an error on resigning. */ -int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_type); +extern bool _lf_federate_reports_error; /** * @brief Update the next event tag of federate `federate_id`. @@ -231,14 +226,14 @@ void handle_port_absent_message(federate_info_t* sending_federate, unsigned char void handle_timed_message(federate_info_t* sending_federate, unsigned char* buffer); /** - * Handle a logical tag complete (LTC) message. @see - * MSG_TYPE_LOGICAL_TAG_COMPLETE in rti.h. + * Handle a latest tag complete (LTC) message. @see + * MSG_TYPE_LATEST_TAG_COMPLETE in rti.h. * * This function assumes the caller does not hold the mutex. * * @param fed The federate that has completed a logical tag. */ -void handle_logical_tag_complete(federate_info_t* fed); +void handle_latest_tag_complete(federate_info_t* fed); /** * Handle a next event tag (NET) message. @see MSG_TYPE_NEXT_EVENT_TAG in rti.h. @@ -250,18 +245,6 @@ void handle_logical_tag_complete(federate_info_t* fed); void handle_next_event_tag(federate_info_t* fed); /////////////////// STOP functions //////////////////// -/** - * Mark a federate requesting stop. - * - * If the number of federates handling stop reaches the - * NUM_OF_FEDERATES, broadcast MSG_TYPE_STOP_GRANTED to every federate. - * - * This function assumes the _RTI.mutex is already locked. - * - * @param fed The federate that has requested a stop or has suddenly - * stopped (disconnected). - */ -void mark_federate_requesting_stop(federate_info_t* fed); /** * Handle a MSG_TYPE_STOP_REQUEST message. 
@@ -291,7 +274,7 @@ void handle_stop_request_reply(federate_info_t* fed); * are initialized to -1. If no MSG_TYPE_ADDRESS_ADVERTISEMENT message has been received from * the destination federate, the RTI will simply reply with -1 for the port. * The sending federate is responsible for checking back with the RTI after a - * period of time. @see connect_to_federate() in federate.c. * + * period of time. * @param fed_id The federate sending a MSG_TYPE_ADDRESS_QUERY message. */ void handle_address_query(uint16_t fed_id); @@ -303,7 +286,7 @@ void handle_address_query(uint16_t fed_id); * field of the _RTI.federates[federate_id] array of structs. * * The server_hostname and server_ip_addr fields are assigned - * in connect_to_federates() upon accepting the socket + * in lf_connect_to_federates() upon accepting the socket * from the remote federate. * * This function assumes the caller does not hold the mutex. @@ -359,34 +342,6 @@ void handle_physical_clock_sync_message(federate_info_t* my_fed, socket_type_t s */ void* clock_synchronization_thread(void* noargs); -/** - * A function to handle messages labeled - * as MSG_TYPE_RESIGN sent by a federate. This - * message is sent at the time of termination - * after all shutdown events are processed - * on the federate. - * - * This function assumes the caller does not hold the mutex. - * - * @note At this point, the RTI might have - * outgoing messages to the federate. This - * function thus first performs a shutdown - * on the socket which sends an EOF. It then - * waits for the remote socket to be closed - * before closing the socket itself. - * - * Assumptions: - * - We assume that the other side (the federates) - * are in charge of closing the socket (by calling - * close() on the socket), and then wait for the RTI - * to shutdown the socket. - * - We assume that calling shutdown() follows the same - * shutdown procedure as stated in the TCP/IP specification. 
- * - * @param my_fed The federate sending a MSG_TYPE_RESIGN message. - **/ -void handle_federate_resign(federate_info_t *my_fed); - /** * Thread handling TCP communication with a federate. * @param fed A pointer to the federate's struct that has the @@ -396,52 +351,10 @@ void* federate_info_thread_TCP(void* fed); /** * Send a MSG_TYPE_REJECT message to the specified socket and close the socket. - * @param socket_id The socket. + * @param socket_id Pointer to the socket ID. * @param error_code An error code. */ -void send_reject(int socket_id, unsigned char error_code); - -/** - * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload - * a federate ID and a federation ID. If the federation ID - * matches this federation, send an MSG_TYPE_ACK and otherwise send - * a MSG_TYPE_REJECT message. Return 1 if the federate is accepted to - * the federation and 0 otherwise. - * @param socket_id The socket on which to listen. - * @param client_fd The socket address. - * @return The federate ID for success or -1 for failure. - */ -int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* client_fd); - -/** - * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill - * out the relevant information in the federate's struct. - */ -int receive_connection_information(int socket_id, uint16_t fed_id); - -/** - * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up - * clock synchronization and perform the initial clock synchronization. - * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message - * payload is not UINT16_MAX. If it is also not 0, then this function sets - * up to perform runtime clock synchronization using the UDP port number - * specified in the payload to communicate with the federate's clock - * synchronization logic. - * @param socket_id The socket on which to listen. - * @param fed_id The federate ID. - * @return 1 for success, 0 for failure. 
- */ -int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id); - -#ifdef __RTI_AUTH__ -/** - * Authenticate incoming federate by performing HMAC-based authentication. - * - * @param socket Socket for the incoming federate tryting to authenticate. - * @return True if authentication is successful and false otherwise. - */ -bool authenticate_federate(int socket); -#endif +void send_reject(int* socket_id, unsigned char error_code); /** * Wait for one incoming connection request from each federate, @@ -449,7 +362,7 @@ bool authenticate_federate(int socket); * that federate. Return when all federates have connected. * @param socket_descriptor The socket on which to accept connections. */ -void connect_to_federates(int socket_descriptor); +void lf_connect_to_federates(int socket_descriptor); /** * Thread to respond to new connections, which could be federates of other diff --git a/core/federated/clock-sync.c b/core/federated/clock-sync.c index e438d83ac..eea5e753c 100644 --- a/core/federated/clock-sync.c +++ b/core/federated/clock-sync.c @@ -71,7 +71,7 @@ instant_t _lf_last_clock_sync_instant = 0LL; /** * The UDP socket descriptor for this federate to communicate with the RTI. - * This is set by setup_clock_synchronization_with_rti() in connect_to_rti() + * This is set by setup_clock_synchronization_with_rti() in lf_connect_to_rti() * in federate.c, which must be called before other * functions that communicate with the rti are called. 
*/ @@ -169,16 +169,14 @@ uint16_t setup_clock_synchronization_with_rti() { _lf_rti_socket_UDP, (struct sockaddr *) &federate_UDP_addr, sizeof(federate_UDP_addr)) < 0) { - lf_print_error_and_exit("Failed to bind its UDP socket: %s.", - strerror(errno)); + lf_print_error_system_failure("Failed to bind its UDP socket."); } // Retrieve the port number that was assigned by the operating system socklen_t addr_length = sizeof(federate_UDP_addr); if (getsockname(_lf_rti_socket_UDP, (struct sockaddr *)&federate_UDP_addr, &addr_length) == -1) { // FIXME: Send 0 UDP_PORT message instead of exiting. // That will disable clock synchronization. - lf_print_error_and_exit("Failed to retrieve UDP port: %s.", - strerror(errno)); + lf_print_error_system_failure("Failed to retrieve UDP port."); } LF_PRINT_DEBUG("Assigned UDP port number %u to its socket.", ntohs(federate_UDP_addr.sin_port)); @@ -205,22 +203,7 @@ uint16_t setup_clock_synchronization_with_rti() { return port_to_return; } -/** - * Synchronize the initial physical clock with the RTI. - * A call to this function is inserted into the startup - * sequence by the code generator if initial clock synchronization - * is required. - * - * This is a blocking function that expects - * to read a MSG_TYPE_CLOCK_SYNC_T1 from the RTI TCP socket. - * It will then follow the PTP protocol to synchronize the local - * physical clock with the RTI. - * Failing to complete this protocol is treated as a catastrophic - * error that causes the federate to exit. 
- * - * @param rti_socket_TCP The rti's socket - */ -void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { +void synchronize_initial_physical_clock_with_rti(int* rti_socket_TCP) { LF_PRINT_DEBUG("Waiting for initial clock synchronization messages from the RTI."); size_t message_size = 1 + sizeof(instant_t); @@ -228,7 +211,7 @@ void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { for (int i=0; i < _LF_CLOCK_SYNC_EXCHANGES_PER_INTERVAL; i++) { // The first message expected from the RTI is MSG_TYPE_CLOCK_SYNC_T1 - read_from_socket_errexit(rti_socket_TCP, message_size, buffer, + read_from_socket_fail_on_error(rti_socket_TCP, message_size, buffer, NULL, "Federate %d did not get the initial clock synchronization message T1 from the RTI.", _lf_my_fed_id); @@ -242,12 +225,12 @@ void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { // Handle the message and send a reply T3 message. // NOTE: No need to acquire the mutex lock during initialization because only // one thread is running. - if (handle_T1_clock_sync_message(buffer, rti_socket_TCP, receive_time) != 0) { + if (handle_T1_clock_sync_message(buffer, *rti_socket_TCP, receive_time) != 0) { lf_print_error_and_exit("Initial clock sync: Failed to send T3 reply to RTI."); } // Next message from the RTI is required to be MSG_TYPE_CLOCK_SYNC_T4 - read_from_socket_errexit(rti_socket_TCP, message_size, buffer, + read_from_socket_fail_on_error(rti_socket_TCP, message_size, buffer, NULL, "Federate %d did not get the clock synchronization message T4 from the RTI.", _lf_my_fed_id); @@ -257,7 +240,7 @@ void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { } // Handle the message. 
- handle_T4_clock_sync_message(buffer, rti_socket_TCP, receive_time); + handle_T4_clock_sync_message(buffer, *rti_socket_TCP, receive_time); } LF_PRINT_LOG("Finished initial clock synchronization with the RTI."); @@ -294,7 +277,7 @@ int handle_T1_clock_sync_message(unsigned char* buffer, int socket, instant_t t2 // Write the reply to the socket. LF_PRINT_DEBUG("Sending T3 message to RTI."); - if (write_to_socket(socket, 1 + sizeof(int), reply_buffer) != 1 + sizeof(int)) { + if (write_to_socket(socket, 1 + sizeof(int), reply_buffer)) { lf_print_error("Clock sync: Failed to send T3 message to RTI."); return -1; } @@ -361,12 +344,11 @@ void handle_T4_clock_sync_message(unsigned char* buffer, int socket, instant_t r if (socket == _lf_rti_socket_UDP) { // Read the coded probe message. // We can reuse the same buffer. - ssize_t bytes_read = read_from_socket(socket, 1 + sizeof(instant_t), buffer); + int read_failed = read_from_socket(socket, 1 + sizeof(instant_t), buffer); instant_t r5 = lf_time_physical(); - if ((bytes_read < 1 + (ssize_t)sizeof(instant_t)) - || buffer[0] != MSG_TYPE_CLOCK_SYNC_CODED_PROBE) { + if (read_failed || buffer[0] != MSG_TYPE_CLOCK_SYNC_CODED_PROBE) { lf_print_warning("Clock sync: Did not get the expected coded probe message from the RTI. " "Skipping clock synchronization round."); return; diff --git a/core/federated/federate.c b/core/federated/federate.c index c57553464..cd9149e9e 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1,68 +1,41 @@ /** * @file - * @author Edward A. Lee (eal@berkeley.edu) - * - * @section LICENSE -Copyright (c) 2020, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. 
Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - * @section DESCRIPTION - * Utility functions for a federate in a federated execution. - * The main entry point is synchronize_with_other_federates(). + * @author Soroush Bateni + * @author Peter Donovan + * @author Edward A. Lee + * @author Anirudh Rengarajsm + * @copyright (c) 2020-2023, The University of California at Berkeley. + * License: BSD 2-clause + * @brief Utility functions for a federate in a federated execution. */ #ifdef FEDERATED -#ifdef PLATFORM_ARDUINO -#error To be implemented. No support for federation on Arduino yet. -#else +#if !defined(PLATFORM_Linux) && !defined(PLATFORM_Darwin) +#error No support for federated execution on this platform. +#endif + #include // inet_ntop & inet_pton #include // Defines getaddrinfo(), freeaddrinfo() and struct addrinfo. #include // Defines struct sockaddr_in - -#include -#include // Defines bzero(). 
#include -#endif +#include // Defines read(), write(), and close() +#include // Defines memset(), strnlen(), strncmp(), strncpy() +#include // Defines strerror() #include #include // Defined perror(), errno -#include -#include // Defines sigaction. -#include -#include -#include -#include // Defines read(), write(), and close() +#include // Defines bzero(). #include "clock-sync.h" #include "federate.h" -#include "lf_types.h" #include "net_common.h" #include "net_util.h" -#include "platform.h" #include "reactor.h" #include "reactor_common.h" #include "reactor_threaded.h" #include "scheduler.h" #include "trace.h" + #ifdef FEDERATED_AUTHENTICATED #include // For secure random number generation. #include // For HMAC-based authentication of federates. @@ -72,28 +45,42 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern instant_t _lf_last_reported_unadjusted_physical_time_ns; extern instant_t start_time; -// Error messages. -char* ERROR_SENDING_HEADER = "ERROR sending header information to federate via RTI"; -char* ERROR_SENDING_MESSAGE = "ERROR sending message to federate via RTI"; +// Global variable defined in reactor_common.c: +extern bool _lf_termination_executed; -// Mutex lock held while performing socket write and close operations. -lf_mutex_t outbound_socket_mutex; -lf_cond_t port_status_changed; -lf_cond_t logical_time_changed; +// Global variables references in federate.h +lf_mutex_t lf_outbound_socket_mutex; +lf_cond_t lf_port_status_changed; +lf_cond_t lf_current_tag_changed; -// Variable to track how far in the reaction queue we can go until we need to wait for more network port statuses to be known. +/** + * The max level allowed to advance (MLAA) is a variable that tracks how far in the reaction + * queue we can go until we need to wait for more network port statuses to be known. 
+ * Specifically, when an input port status is unknown at a tag (we don't know whether the upstream + * federate has sent or will send a message at that tag), then the downstream federate must + * pause before executing any reaction that depends on that port. A "level" is assigned to that + * port by the code generator based on the overall topology of the federation. Reactions that + * depend on the port have higher levels, whereas those with no dependence on that port have + * lower levels. The MLAA is a level at which the federate must block until the MLAA is + * incremented. It will be incremented as port statuses become known, and when all are known, + * it will become INT_MAX and all reactions will be unblocked. In decentralized execution, the + * MLAA is incremented by a background thread that monitors the local physical clock and + * increments the MLAA when it is safe to assume that the port is absent, if it has not already + * been incremented by the arrival of a message. In centralized execution, the MLAA is used + * only for ports that are involved in a zero-delay cycle (ZDC), and it is incremented when + * either a message or an absent message arrives. + */ int max_level_allowed_to_advance; /** - * The state of this federate instance. + * The state of this federate instance. Each executable has exactly one federate instance, + * and the _fed global variable refers to that instance. 
*/ federate_instance_t _fed = { .socket_TCP_RTI = -1, .number_of_inbound_p2p_connections = 0, .inbound_socket_listeners = NULL, .number_of_outbound_p2p_connections = 0, - .sockets_for_inbound_p2p_connections = { -1 }, - .sockets_for_outbound_p2p_connections = { -1 }, .inbound_p2p_handling_thread_id = 0, .server_socket = -1, .server_port = -1, @@ -107,7 +94,6 @@ federate_instance_t _fed = { .min_delay_from_physical_action_to_federate_output = NEVER }; - federation_metadata_t federation_metadata = { .federation_id = "Unidentified Federation", .rti_host = NULL, @@ -115,1048 +101,874 @@ federation_metadata_t federation_metadata = { .rti_user = NULL }; +////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// +// Static functions (used only internally) /** - * Create a server to listen to incoming physical - * connections from remote federates. This function - * only handles the creation of the server socket. - * The reserved port for the server socket is then - * sent to the RTI by sending an MSG_TYPE_ADDRESS_ADVERTISEMENT message - * (@see net_common.h). This function expects no response - * from the RTI. - * - * If a port is specified by the user, that will be used - * as the only possibility for the server. This function - * will fail if that port is not available. If a port is not - * specified, the STARTING_PORT (@see net_common.h) will be used. - * The function will keep incrementing the port in this case - * until the number of tries reaches PORT_RANGE_LIMIT. - * - * @note This function is similar to create_server(...) in rti.c. - * However, it contains specific log messages for the peer to - * peer connections between federates. It also additionally - * sends an address advertisement (MSG_TYPE_ADDRESS_ADVERTISEMENT) message to the - * RTI informing it of the port. - * - * @param specified_port The specified port by the user. 
- */ -void create_server(int specified_port) { - if (specified_port > UINT16_MAX || - specified_port < 0) { - lf_print_error( - "create_server(): The specified port (%d) is out of range." - " Starting with %d instead.", - specified_port, - STARTING_PORT - ); - specified_port = 0; - } - uint16_t port = (uint16_t)specified_port; - if (specified_port == 0) { - // Use the default starting port. - port = STARTING_PORT; - } - LF_PRINT_DEBUG("Creating a socket server on port %d.", port); - // Create an IPv4 socket for TCP (not UDP) communication over IP (0). - int socket_descriptor = create_real_time_tcp_socket_errexit(); - - // Server file descriptor. - struct sockaddr_in server_fd; - // Zero out the server address structure. - bzero((char*)&server_fd, sizeof(server_fd)); - - server_fd.sin_family = AF_INET; // IPv4 - server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. - // Convert the port number from host byte order to network byte order. - server_fd.sin_port = htons(port); - - int result = bind( - socket_descriptor, - (struct sockaddr *) &server_fd, - sizeof(server_fd)); - // If the binding fails with this port and no particular port was specified - // in the LF program, then try the next few ports in sequence. - while (result != 0 - && specified_port == 0 - && port >= STARTING_PORT - && port <= STARTING_PORT + PORT_RANGE_LIMIT) { - LF_PRINT_DEBUG("Failed to get port %d. Trying %d.", port, port + 1); - port++; - server_fd.sin_port = htons(port); - result = bind( - socket_descriptor, - (struct sockaddr *) &server_fd, - sizeof(server_fd)); - } - if (result != 0) { - if (specified_port == 0) { - lf_print_error_and_exit("Failed to bind socket. Cannot find a usable port. \ - Consider increasing PORT_RANGE_LIMIT in federate.c"); - } else { - lf_print_error_and_exit("Failed to bind socket. Specified port is not available. 
\ - Consider leaving the port unspecified"); - } - } - LF_PRINT_LOG("Server for communicating with other federates started using port %d.", port); - - // Enable listening for socket connections. - // The second argument is the maximum number of queued socket requests, - // which according to the Mac man page is limited to 128. - listen(socket_descriptor, 128); - - // Set the global server port - _fed.server_port = port; - - // Send the server port number to the RTI - // on an MSG_TYPE_ADDRESS_ADVERTISEMENT message (@see net_common.h). - unsigned char buffer[sizeof(int32_t) + 1]; - buffer[0] = MSG_TYPE_ADDRESS_ADVERTISEMENT; - encode_int32(_fed.server_port, &(buffer[1])); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_ADR_AD, _lf_my_fed_id, NULL); - write_to_socket_errexit(_fed.socket_TCP_RTI, sizeof(int32_t) + 1, (unsigned char*)buffer, - "Failed to send address advertisement."); - LF_PRINT_DEBUG("Sent port %d to the RTI.", _fed.server_port); - - // Set the global server socket - _fed.server_socket = socket_descriptor; -} - -/** - * Send a message to another federate directly or via the RTI. - * This method assumes that the caller does not hold the outbound_socket_mutex lock, - * which it acquires to perform the send. - * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. - * - * @note This function is similar to send_timed_message() except that it - * does not deal with time and timed_messages. - * - * @param message_type The type of the message being sent. - * Currently can be MSG_TYPE_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_MESSAGE for messages sent between - * federates. - * @param port The ID of the destination port. - * @param federate The ID of the destination federate. - * @param next_destination_str The name of the next destination in string format - * @param length The message length. 
- * @param message The message. - * @return 1 if the message has been sent, 0 otherwise. - * FIXME: Currently, federates can send untimed messages to RTI, but there is no - * handling mechanism of MSG_TYPE_MESSAGE at the RTI side. - * Is it really needed? Or should the RTI be updated? - */ -int send_message(int message_type, - unsigned short port, - unsigned short federate, - const char* next_destination_str, - size_t length, - unsigned char* message) { - unsigned char header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t)]; - // First byte identifies this as a timed message. - if (message_type != MSG_TYPE_MESSAGE && - message_type != MSG_TYPE_P2P_MESSAGE - ) { - lf_print_error( - "send_message() was called with an invalid message type (%d).", - message_type - ); - return 0; - } - header_buffer[0] = (unsigned char)message_type; - // Next two bytes identify the destination port. - // NOTE: Send messages little endian, not big endian. - encode_uint16(port, &(header_buffer[1])); - - // Next two bytes identify the destination federate. - encode_uint16(federate, &(header_buffer[1 + sizeof(uint16_t)])); - - // The next four bytes are the message length. - encode_int32((int32_t)length, &(header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t)])); - - LF_PRINT_LOG("Sending untimed message to %s.", next_destination_str); - - // Header: message_type + port_id + federate_id + length of message + timestamp + microstep - const int header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); - // Use a mutex lock to prevent multiple threads from simultaneously sending. - lf_mutex_lock(&outbound_socket_mutex); - // First, check that the socket is still connected. This must done - // while holding the mutex lock. 
- int socket = -1; - if (message_type == MSG_TYPE_P2P_MESSAGE) { - socket = _fed.sockets_for_outbound_p2p_connections[federate]; - } else { - socket = _fed.socket_TCP_RTI; - } - if (socket < 0) { - lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return 0; - } - // Trace the event when tracing is enabled - if (message_type == MSG_TYPE_P2P_MESSAGE) { - tracepoint_federate_to_federate(_fed.trace, send_P2P_MSG, _lf_my_fed_id, federate, NULL); - } else { // message_type == MSG_TYPE_MESSAGE) - tracepoint_federate_to_rti(_fed.trace, send_MSG, _lf_my_fed_id, NULL); - } - write_to_socket_with_mutex(socket, header_length, header_buffer, &outbound_socket_mutex, - "Failed to send message header to to %s.", next_destination_str); - write_to_socket_with_mutex(socket, length, message, &outbound_socket_mutex, - "Failed to send message body to to %s.", next_destination_str); - lf_mutex_unlock(&outbound_socket_mutex); - return 1; -} - -/** - * Send the specified timestamped message to the specified port in the - * specified federate via the RTI or directly to a federate depending on - * the given socket. The timestamp is calculated as current_logical_time + - * additional delay which is greater than or equal to zero. - * The port should be an input port of a reactor in - * the destination federate. This version does include the timestamp - * in the message. The caller can reuse or free the memory after this returns. - * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. - * - * This method assumes that the caller does not hold the outbound_socket_mutex lock, - * which it acquires to perform the send. - * - * @note This function is similar to send_message() except that it - * sends timed messages and also contains logics related to time. 
- * - * @param env The environment of the federate - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. - * @param message_type The type of the message being sent. - * Currently can be MSG_TYPE_TAGGED_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_TAGGED_MESSAGE for messages sent between - * federates. - * @param port The ID of the destination port. - * @param federate The ID of the destination federate. - * @param next_destination_str The next destination in string format (RTI or federate) - * (used for reporting errors). - * @param length The message length. - * @param message The message. - * @return 1 if the message has been sent, 0 otherwise. - */ -int send_timed_message(environment_t* env, - interval_t additional_delay, - int message_type, - unsigned short port, - unsigned short federate, - const char* next_destination_str, - size_t length, - unsigned char* message) { - assert(env != GLOBAL_ENVIRONMENT); - - unsigned char header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t) - + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t)]; - // First byte identifies this as a timed message. - if (message_type != MSG_TYPE_TAGGED_MESSAGE && - message_type != MSG_TYPE_P2P_TAGGED_MESSAGE - ) { - lf_print_error( - "send_message() was called with an invalid message type (%d).", - message_type - ); - return 0; - } - size_t buffer_head = 0; - header_buffer[buffer_head] = (unsigned char)message_type; - buffer_head += sizeof(unsigned char); - // Next two bytes identify the destination port. - // NOTE: Send messages little endian, not big endian. - encode_uint16(port, &(header_buffer[buffer_head])); - buffer_head += sizeof(uint16_t); - - // Next two bytes identify the destination federate. 
- encode_uint16(federate, &(header_buffer[buffer_head])); - buffer_head += sizeof(uint16_t); - - // The next four bytes are the message length. - encode_int32((int32_t)length, &(header_buffer[buffer_head])); - buffer_head += sizeof(int32_t); - - // Apply the additional delay to the current tag and use that as the intended - // tag of the outgoing message - tag_t current_message_intended_tag = lf_delay_tag(env->current_tag, - additional_delay); - - // Next 8 + 4 will be the tag (timestamp, microstep) - encode_tag( - &(header_buffer[buffer_head]), - current_message_intended_tag - ); - buffer_head += sizeof(int64_t) + sizeof(uint32_t); - - LF_PRINT_LOG("Sending message with tag " PRINTF_TAG " to %s.", - current_message_intended_tag.time - start_time, current_message_intended_tag.microstep, next_destination_str); - - // Header: message_type + port_id + federate_id + length of message + timestamp + microstep - size_t header_length = buffer_head; - - if (_lf_is_tag_after_stop_tag(env, current_message_intended_tag)) { - // Message tag is past the timeout time (the stop time) so it should - // not be sent. - return 0; - } - - // Use a mutex lock to prevent multiple threads from simultaneously sending. - lf_mutex_lock(&outbound_socket_mutex); - // First, check that the socket is still connected. This must done - // while holding the mutex lock. - int socket = -1; - if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { - socket = _fed.sockets_for_outbound_p2p_connections[federate]; - } else { - socket = _fed.socket_TCP_RTI; - } - if (socket < 0) { - lf_print_warning("Socket is no longer connected. 
Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return 0; - } - // Trace the event when tracing is enabled - if (message_type == MSG_TYPE_TAGGED_MESSAGE) { - tracepoint_federate_to_rti(_fed.trace, send_TAGGED_MSG, _lf_my_fed_id, &current_message_intended_tag); - } else { // message_type == MSG_TYPE_P2P_TAGGED_MESSAGE - tracepoint_federate_to_federate(_fed.trace, send_P2P_TAGGED_MSG, _lf_my_fed_id, federate, &current_message_intended_tag); - } - write_to_socket_with_mutex(socket, header_length, header_buffer, &outbound_socket_mutex, - "Failed to send timed message header to %s.", next_destination_str); - write_to_socket_with_mutex(socket, length, message, &outbound_socket_mutex, - "Failed to send timed message body to %s.", next_destination_str); - lf_mutex_unlock(&outbound_socket_mutex); - return 1; -} - -/** - * Send a time to the RTI. - * This is not synchronized. - * It assumes the caller is. + * Send a time to the RTI. This acquires the lf_outbound_socket_mutex. * @param type The message type (MSG_TYPE_TIMESTAMP). * @param time The time. - * @param exit_on_error If set to true, exit the program if sending 'time' fails. - * Print a soft error message otherwise */ -void _lf_send_time(unsigned char type, instant_t time, bool exit_on_error) { +static void send_time(unsigned char type, instant_t time) { LF_PRINT_DEBUG("Sending time " PRINTF_TIME " to the RTI.", time); size_t bytes_to_write = 1 + sizeof(instant_t); unsigned char buffer[bytes_to_write]; buffer[0] = type; encode_int64(time, &(buffer[1])); - lf_mutex_lock(&outbound_socket_mutex); - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket is no longer connected. 
Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return; - } - tag_t tag = {.time = time, .microstep = 0}; // Trace the event when tracing is enabled + tag_t tag = {.time = time, .microstep = 0}; tracepoint_federate_to_rti(_fed.trace, send_TIMESTAMP, _lf_my_fed_id, &tag); - ssize_t bytes_written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, buffer); - if (bytes_written < (ssize_t)bytes_to_write) { - if (!exit_on_error) { - lf_print_error("Failed to send time " PRINTF_TIME " to the RTI." - " Error code %d: %s", - time - start_time, - errno, - strerror(errno) - ); - - } else if (errno == ENOTCONN) { - // FIXME: Shutdown is probably not working properly because the socket gets disconnected. - lf_print_error("Socket to the RTI is no longer connected. Considering this a soft error."); - } else { - lf_print_error_and_exit("Failed to send time " PRINTF_TIME " to the RTI." - " Error code %d: %s", - time - start_time, - errno, - strerror(errno) - ); - } - } - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_write, buffer, &lf_outbound_socket_mutex, + "Failed to send time " PRINTF_TIME " to the RTI.", time - start_time); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); } /** * Send a tag to the RTI. - * This is not synchronized. - * It assumes the caller is. - * @param type The message type (MSG_TYPE_NEXT_EVENT_TAG or MSG_TYPE_LOGICAL_TAG_COMPLETE). + * This function acquires the lf_outbound_socket_mutex. + * @param type The message type (MSG_TYPE_NEXT_EVENT_TAG or MSG_TYPE_LATEST_TAG_COMPLETE). * @param tag The tag. - * @param exit_on_error If set to true, exit the program if sending 'tag' fails. 
- * Print a soft error message otherwise */ -void _lf_send_tag(unsigned char type, tag_t tag, bool exit_on_error) { +static void send_tag(unsigned char type, tag_t tag) { LF_PRINT_DEBUG("Sending tag " PRINTF_TAG " to the RTI.", tag.time - start_time, tag.microstep); size_t bytes_to_write = 1 + sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_write]; buffer[0] = type; encode_tag(&(buffer[1]), tag); - lf_mutex_lock(&outbound_socket_mutex); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); if (_fed.socket_TCP_RTI < 0) { lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); return; } trace_event_t event_type = (type == MSG_TYPE_NEXT_EVENT_TAG) ? send_NET : send_LTC; // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, event_type, _lf_my_fed_id, &tag); - ssize_t bytes_written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, buffer); - if (bytes_written < (ssize_t)bytes_to_write) { - if (!exit_on_error) { - lf_print_error("Failed to send tag " PRINTF_TAG " to the RTI." - " Error code %d: %s", - tag.time - start_time, - tag.microstep, - errno, - strerror(errno) - ); - return; - } else if (errno == ENOTCONN) { - lf_print_error("Socket to the RTI is no longer connected. Considering this a soft error."); - return; - } else { - lf_mutex_unlock(&outbound_socket_mutex); - lf_print_error_and_exit("Failed to send tag " PRINTF_TAG " to the RTI." 
- " Error code %d: %s", - tag.time - start_time, - tag.microstep, - errno, - strerror(errno) - ); - } - } - lf_mutex_unlock(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, bytes_to_write, buffer, &lf_outbound_socket_mutex, + "Failed to send tag " PRINTF_TAG " to the RTI.", tag.time - start_time, tag.microstep); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); } /** - * Thread to accept connections from other federates that send this federate - * messages directly (not through the RTI). This thread starts a thread for - * each accepted socket connection and, once it has opened all expected - * sockets, exits. - * @param env_arg pointer to the environment of this federate. + * Return true if either the socket to the RTI is broken or the socket is + * alive and the first unread byte on the socket's queue is MSG_TYPE_FAILED. */ +static bool rti_failed() { + unsigned char first_byte; + ssize_t bytes = peek_from_socket(_fed.socket_TCP_RTI, &first_byte); + if (bytes < 0 || (bytes == 1 && first_byte == MSG_TYPE_FAILED)) return true; + else return false; +} -void* handle_p2p_connections_from_federates(void* env_arg) { - assert(env_arg); - environment_t* env = (environment_t *) env_arg; - int received_federates = 0; - // Allocate memory to store thread IDs. - _fed.inbound_socket_listeners = (lf_thread_t*)calloc(_fed.number_of_inbound_p2p_connections, sizeof(lf_thread_t)); - while (received_federates < _fed.number_of_inbound_p2p_connections) { - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - int socket_id = accept(_fed.server_socket, &client_fd, &client_length); - // FIXME: Error handling here is too harsh maybe? - if (socket_id < 0 && errno != EAGAIN && errno != EWOULDBLOCK) { - lf_print_error("A fatal error occurred while accepting a new socket. 
" - "Federate will not accept connections anymore."); - return NULL; - } - LF_PRINT_LOG("Accepted new connection from remote federate."); +//////////////////////////////// Port Status Handling /////////////////////////////////////// - size_t header_length = 1 + sizeof(uint16_t) + 1; - unsigned char buffer[header_length]; - ssize_t bytes_read = read_from_socket(socket_id, header_length, (unsigned char*)&buffer); - if (bytes_read != (ssize_t)header_length || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { - lf_print_warning("Federate received invalid first message on P2P socket. Closing socket."); - if (bytes_read >= 0) { - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = WRONG_SERVER; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); - // Ignore errors on this response. - write_to_socket(socket_id, 2, response); - } - close(socket_id); - continue; - } +extern lf_action_base_t* _lf_action_table[]; +extern interval_t _lf_action_delay_table[]; +extern size_t _lf_action_table_size; +extern lf_action_base_t* _lf_zero_delay_cycle_action_table[]; +extern size_t _lf_zero_delay_cycle_action_table_size; +extern reaction_t* network_input_reactions[]; +extern size_t num_network_input_reactions; +extern reaction_t* port_absent_reaction[]; +extern size_t num_port_absent_reactions; +#ifdef FEDERATED_DECENTRALIZED +extern staa_t* staa_lst[]; +extern size_t staa_lst_size; +#endif - // Get the federation ID and check it. - unsigned char federation_id_length = buffer[header_length - 1]; - char remote_federation_id[federation_id_length]; - bytes_read = read_from_socket(socket_id, federation_id_length, (unsigned char*)remote_federation_id); - if (bytes_read != federation_id_length - || (strncmp(federation_metadata.federation_id, remote_federation_id, strnlen(federation_metadata.federation_id, 255)) != 0)) { - lf_print_warning("Received invalid federation ID. 
Closing socket."); - if (bytes_read >= 0) { - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); - // Ignore errors on this response. - write_to_socket(socket_id, 2, response); - } - close(socket_id); - continue; - } +/** + * Return a pointer to the action struct for the action + * corresponding to the specified port ID. + * @param port_id The port ID. + * @return A pointer to an action struct or null if the ID is out of range. + */ +static lf_action_base_t* action_for_port(int port_id) { + if (port_id >= 0 && port_id < _lf_action_table_size) { + return _lf_action_table[port_id]; + } + lf_print_error_and_exit("Invalid port ID: %d", port_id); + return NULL; +} - // Extract the ID of the sending federate. - uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); - LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); - - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, receive_FED_ID, _lf_my_fed_id, remote_fed_id, NULL); - - // Once we record the socket_id here, all future calls to close() on - // the socket should be done while holding a mutex, and this array - // element should be reset to -1 during that critical section. - // Otherwise, there can be race condition where, during termination, - // two threads attempt to simultaneously access the socket. - _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = socket_id; - - // Send an MSG_TYPE_ACK message. 
- unsigned char response = MSG_TYPE_ACK; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); - write_to_socket_errexit(socket_id, 1, (unsigned char*)&response, - "Failed to write MSG_TYPE_ACK in response to federate %d.", - remote_fed_id); - - // Start a thread to listen for incoming messages from other federates. - // The fed_id is a uint16_t, which we assume can be safely cast to and from void*. - void* fed_id_arg = (void*)(uintptr_t)remote_fed_id; - int result = lf_thread_create( - &_fed.inbound_socket_listeners[received_federates], - listen_to_federates, - fed_id_arg); - if (result != 0) { - // Failed to create a listening thread. - close(socket_id); - _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = -1; - lf_print_error_and_exit( - "Failed to create a thread to listen for incoming physical connection. Error code: %d.", - result +/** + * Update the last known status tag of all network input ports + * to the value of `tag`, unless the provided `tag` is less + * than the last_known_status_tag of the port. This is called when + * a TAG signal is received from the RTI in centralized coordination. + * If any update occurs, then this broadcasts on `lf_port_status_changed`. + * + * This assumes the caller holds the mutex. + * + * @param tag The tag on which the latest status of all network input + * ports is known. + */ +static void update_last_known_status_on_input_ports(tag_t tag) { + LF_PRINT_DEBUG("In update_last_known_status_on_input ports."); + bool notify = false; + for (int i = 0; i < _lf_action_table_size; i++) { + lf_action_base_t* input_port_action = _lf_action_table[i]; + // This is called when a TAG is received. 
+ // But it is possible for an input port to have received already + // a message with a larger tag (if there is an after delay on the + // connection), in which case, the last known status tag of the port + // is in the future and should not be rolled back. So in that case, + // we do not update the last known status tag. + if (lf_tag_compare(tag, + input_port_action->trigger->last_known_status_tag) >= 0) { + LF_PRINT_DEBUG( + "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", + i, + input_port_action->trigger->last_known_status_tag.time - lf_time_start(), + input_port_action->trigger->last_known_status_tag.microstep, + tag.time - lf_time_start(), + tag.microstep ); + input_port_action->trigger->last_known_status_tag = tag; + notify = true; } - - received_federates++; } - - LF_PRINT_LOG("All remote federates are connected."); - return NULL; + // FIXME: We could put a condition variable into the trigger_t + // struct for each network input port, in which case this won't + // be a broadcast but rather a targeted signal. + if (notify && lf_update_max_level(tag, false)) { + // Notify network input reactions + lf_cond_broadcast(&lf_port_status_changed); + } } /** - * Close the socket that sends outgoing messages to the - * specified federate ID. This function assumes the caller holds - * the outbound_socket_mutex mutex lock. - * @param fed_id The ID of the peer federate receiving messages from this - * federate, or -1 if the RTI (centralized coordination). + * @brief Update the last known status tag of a network input port. + * + * First, if the specified tag is less than the current_tag of the top-level + * environment, then ignore the specified tag and use the current_tag. This + * situation can arise if a message has arrived late (an STP violation has occurred). + * + * If the specified tag is greater than the previous last_known_status_tag + * of the port, then update the last_known_status_tag to the new tag. 
+ * + * If the tag is equal to the previous last_known_status_tag, then + * increment the microstep of the last_known_status_tag. This situation can + * occur if a sequence of late messages (STP violations) are occurring all at + * once during an execution of a logical tag. + * + * This function is called when a message or absent message arrives. For decentralized + * coordination, it is also called by the background thread update_ports_from_staa_offsets + * which uses physical time to determine when an input port can be assumed to be absent + * if a message has not been received. + * + * This function assumes the caller holds the mutex on the top-level environment, + * and, if the tag actually increases, it broadcasts on `lf_port_status_changed`. + * + * @param env The top-level environment, whose mutex is assumed to be held. + * @param tag The tag on which the latest status of the specified network input port is known. + * @param port_id The port ID. */ -void _lf_close_outbound_socket(int fed_id) { - assert (fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); - if (_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { - shutdown(_fed.sockets_for_outbound_p2p_connections[fed_id], SHUT_RDWR); - close(_fed.sockets_for_outbound_p2p_connections[fed_id]); - _fed.sockets_for_outbound_p2p_connections[fed_id] = -1; +static void update_last_known_status_on_input_port(environment_t* env, tag_t tag, int port_id) { + if (lf_tag_compare(tag, env->current_tag) < 0) tag = env->current_tag; + trigger_t* input_port_action = action_for_port(port_id)->trigger; + int comparison = lf_tag_compare(tag, input_port_action->last_known_status_tag); + if (comparison == 0) tag.microstep++; + if (comparison >= 0) { + LF_PRINT_LOG( + "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", + port_id, + input_port_action->last_known_status_tag.time - lf_time_start(), + input_port_action->last_known_status_tag.microstep, + tag.time - lf_time_start(), + tag.microstep + 
); + input_port_action->last_known_status_tag = tag; + + // Check whether this port update implies a change to MLAA, which may unblock reactions. + // For decentralized coordination, the first argument is NEVER, so it has no effect. + // For centralized, the arguments probably also have no effect, but the port update may. + // Note that it would not be correct to pass `tag` as the first argument because + // there is no guarantee that there is either a TAG or a PTAG for this time. + // The message that triggered this to be called could be from an upstream + // federate that is far ahead of other upstream federates in logical time. + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_cond_broadcast(&lf_port_status_changed); + } else { + // Message arrivals should be monotonic, so this should not occur. + lf_print_warning("Attempt to update the last known status tag " + "of network input port %d to an earlier tag was ignored.", port_id); } } /** - * For each incoming message socket, we create this thread that listens - * for upstream messages. Currently, the only possible upstream message - * is MSG_TYPE_CLOSE_REQUEST. If this thread receives that message, then closes - * the socket. The idea here is that a peer-to-peer socket connection - * is always closed from the sending end, never from the receiving end. - * This way, any sends in progress complete before the socket is actually - * closed. + * Set the status of network port with id portID. + * + * @param portID The network port ID + * @param status The network port status (port_status_t) */ -void* listen_for_upstream_messages_from_downstream_federates(void* fed_id_ptr) { - uint16_t fed_id = *((uint16_t*)fed_id_ptr); - unsigned char message; - - lf_mutex_lock(&outbound_socket_mutex); - while(_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { - // Unlock the mutex before performing a blocking read. 
- // Note that there is a race condition here, but the read will return - // a failure if the socket gets closed. - lf_mutex_unlock(&outbound_socket_mutex); - - LF_PRINT_DEBUG("Thread listening for MSG_TYPE_CLOSE_REQUEST from federate %d", fed_id); - ssize_t bytes_read = read_from_socket( - _fed.sockets_for_outbound_p2p_connections[fed_id], 1, &message); - // Reacquire the mutex lock before closing or reading the socket again. - lf_mutex_lock(&outbound_socket_mutex); - - if (bytes_read == 1 && message == MSG_TYPE_CLOSE_REQUEST) { - // Received a request to close the socket. - LF_PRINT_DEBUG("Received MSG_TYPE_CLOSE_REQUEST from federate %d.", fed_id); - // Trace the event when tracing is enabled - tracepoint_federate_from_federate(_fed.trace, receive_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); - _lf_close_outbound_socket(fed_id); - break; - } - if (bytes_read == 0) { - // EOF. - LF_PRINT_DEBUG("Received EOF from federate %d.", fed_id); - _lf_close_outbound_socket(fed_id); - break; - } - if (bytes_read < 0) { - // EOF. - LF_PRINT_DEBUG("Error on socket from federate %d.", fed_id); - _lf_close_outbound_socket(fed_id); - break; - } - } - lf_mutex_unlock(&outbound_socket_mutex); - return NULL; +static void set_network_port_status(int portID, port_status_t status) { + lf_action_base_t* network_input_port_action = action_for_port(portID); + network_input_port_action->trigger->status = status; } /** - * Connect to the federate with the specified id. This established - * connection will then be used in functions such as send_timed_message() - * to send messages directly to the specified federate. - * This function first sends an MSG_TYPE_ADDRESS_QUERY message to the RTI to obtain - * the IP address and port number of the specified federate. It then attempts - * to establish a socket connection to the specified federate. - * If this fails, the program exits. 
If it succeeds, it sets element [id] of - * the _fed.sockets_for_outbound_p2p_connections global array to - * refer to the socket for communicating directly with the federate. - * @param remote_federate_id The ID of the remote federate. + * Version of schedule_value() similar to that in reactor_common.c + * except that it does not acquire the mutex lock and has a special + * behavior during startup where it can inject reactions to the reaction + * queue if execution has not started yet. + * It is also responsible for setting the intended tag of the + * network message based on the calculated delay. + * This function assumes that the caller holds the mutex lock. + * + * This is used for handling incoming timed messages to a federate. + * + * @param env The environment of the federate + * @param action The action or timer to be triggered. + * @param tag The tag of the message received over the network. + * @param value Dynamically allocated memory containing the value to send. + * @param length The length of the array, if it is an array, or 1 for a + * scalar and 0 for no payload. + * @return A handle to the event, or 0 if no event was scheduled, or -1 for error. */ -void connect_to_federate(uint16_t remote_federate_id) { - int result = -1; - int count_retries = 0; - - // Ask the RTI for port number of the remote federate. - // The buffer is used for both sending and receiving replies. - // The size is what is needed for receiving replies. - unsigned char buffer[sizeof(int32_t) + INET_ADDRSTRLEN]; - int port = -1; - struct in_addr host_ip_addr; - int count_tries = 0; - while (port == -1) { - buffer[0] = MSG_TYPE_ADDRESS_QUERY; - // NOTE: Sending messages in little endian. 
- encode_uint16(remote_federate_id, &(buffer[1])); - - LF_PRINT_DEBUG("Sending address query for federate %d.", remote_federate_id); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_ADR_QR, _lf_my_fed_id, NULL); - write_to_socket_errexit(_fed.socket_TCP_RTI, sizeof(uint16_t) + 1, buffer, - "Failed to send address query for federate %d to RTI.", - remote_federate_id); +static trigger_handle_t schedule_message_received_from_network_locked( + environment_t* env, + trigger_t* trigger, + tag_t tag, + lf_token_t* token) { + assert(env != GLOBAL_ENVIRONMENT); - // Read RTI's response. - read_from_socket_errexit(_fed.socket_TCP_RTI, sizeof(int32_t), buffer, - "Failed to read the requested port number for federate %d from RTI.", - remote_federate_id); + // Return value of the function + trigger_handle_t return_value = 0; - port = extract_int32(buffer); + // Indicates whether or not the intended tag + // of the message (timestamp, microstep) is + // in the future relative to the tag of this + // federate. By default, assume it is not. + bool message_tag_is_in_the_future = lf_tag_compare(tag, env->current_tag) > 0; + // Assign the intended tag temporarily to restore later. + tag_t previous_intended_tag = trigger->intended_tag; + trigger->intended_tag = tag; - read_from_socket_errexit(_fed.socket_TCP_RTI, sizeof(host_ip_addr), (unsigned char*)&host_ip_addr, - "Failed to read the IP address for federate %d from RTI.", - remote_federate_id); + // Calculate the extra_delay required to be passed + // to the schedule function. + interval_t extra_delay = tag.time - env->current_tag.time; + if (!message_tag_is_in_the_future && env->execution_started) { +#ifdef FEDERATED_CENTRALIZED + // If the coordination is centralized, receiving a message + // that does not carry a timestamp that is in the future + // would indicate a critical condition, showing that the + // time advance mechanism is not working correctly. 
+ LF_MUTEX_UNLOCK(env->mutex); + lf_print_error_and_exit( + "Received a message at tag " PRINTF_TAG " that has a tag " PRINTF_TAG + " that has violated the STP offset. " + "Centralized coordination should not have these types of messages.", + env->current_tag.time - start_time, env->current_tag.microstep, + tag.time - start_time, tag.microstep); +#else + // Set the delay back to 0 + extra_delay = 0LL; + LF_PRINT_LOG("Calling schedule with 0 delay and intended tag " PRINTF_TAG ".", + trigger->intended_tag.time - start_time, + trigger->intended_tag.microstep); + return_value = _lf_schedule(env, trigger, extra_delay, token); +#endif + } else { + // In case the message is in the future, call + // _lf_schedule_at_tag() so that the microstep is respected. + LF_PRINT_LOG("Received a message that is (" PRINTF_TIME " nanoseconds, " PRINTF_MICROSTEP " microsteps) " + "in the future.", extra_delay, tag.microstep - env->current_tag.microstep); + return_value = _lf_schedule_at_tag(env, trigger, tag, token); + } + trigger->intended_tag = previous_intended_tag; + // Notify the main thread in case it is waiting for physical time to elapse. + LF_PRINT_DEBUG("Broadcasting notification that event queue changed."); + lf_cond_broadcast(&env->event_q_changed); + return return_value; +} - // A reply of -1 for the port means that the RTI does not know - // the port number of the remote federate, presumably because the - // remote federate has not yet sent an MSG_TYPE_ADDRESS_ADVERTISEMENT message to the RTI. - // Sleep for some time before retrying. - if (port == -1) { - if (count_tries++ >= CONNECT_NUM_RETRIES) { - lf_print_error_and_exit("TIMEOUT obtaining IP/port for federate %d from the RTI.", - remote_federate_id); - } - // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. - if (lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; +/** + * Close the socket that receives incoming messages from the + * specified federate ID. 
This function should be called when a read + * of incoming socket fails or when an EOF is received. + * It can also be called when the receiving end wants to stop communication, + * in which case, flag should be 1. + * + * @param fed_id The ID of the peer federate sending messages to this + * federate. + * @param flag 0 if an EOF was received, -1 if a socket error occurred, 1 otherwise. + */ +static void close_inbound_socket(int fed_id, int flag) { + LF_MUTEX_LOCK(socket_mutex); + if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { + if (flag >= 0) { + if (flag > 0) { + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); + } else { + // Have received EOF from the other end. Send EOF to the other end. + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_WR); } } + close(_fed.sockets_for_inbound_p2p_connections[fed_id]); + _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; } - assert(port < 65536); - assert(port > 0); - uint16_t uport = (uint16_t)port; + LF_MUTEX_UNLOCK(socket_mutex); +} -#if LOG_LEVEL > 3 - // Print the received IP address in a human readable format - // Create the human readable format of the received address. - // This is avoided unless LOG_LEVEL is high enough to - // subdue the overhead caused by inet_ntop(). - char hostname[INET_ADDRSTRLEN]; - inet_ntop(AF_INET, &host_ip_addr, hostname, INET_ADDRSTRLEN); - LF_PRINT_LOG("Received address %s port %d for federate %d from RTI.", - hostname, uport, remote_federate_id); -#endif +/** + * Return true if reactions need to be inserted directly into the reaction queue and + * false if a call to schedule is needed (the normal case). This function handles zero-delay + * cycles, where processing at a tag must be able to begin before all messages have arrived + * at that tag. This returns true if the following conditions are all true: + * + * 1. the first reaction triggered has a level >= MLAA (a port is or will be blocked on this trigger); + * 2. 
the intended_tag is equal to the current tag of the environment; + * 3. the intended_tag is greater than the last_tag of the trigger; + * 4. the intended_tag is greater than the last_known_status_tag of the trigger; + * 5. the execution has started (the event queue has been examined); + * 6. the trigger is not physical; + * + * The comparison against the MLAA (condition 1), if true, means that there is a blocking port + * waiting for this trigger (or possibly an earlier blocking port). For condition (2), tardy + * messages are not scheduled now (they are already late), so if a reaction is blocked on + * unknown status of this port, it will be unblocked with an absent. The comparison against the + * last_tag of the trigger (condition 3) ensures that if the message is tardy but there is + * already an earlier tardy message that has been handled (or is being handled), then we + * don't try to handle two messages in the same tag, which is not allowed. For example, there + * could be a case where current tag is 10 with a port absent reaction waiting, and a message + * has arrived with intended_tag 8. This message will eventually cause the port absent reaction + * to exit, but before that, a message with intended_tag of 9 could arrive before the port absent + * reaction has had a chance to exit. The port status is on the other hand changed in this thread, + * and thus, can be checked in this scenario without this race condition. The message with + * intended_tag of 9 in this case needs to wait one microstep to be processed. The check with + * last_known_status_tag (condition 4) deals with messages arriving with identical intended + * tags (which should not happen). This one will be handled late (one microstep later than + * the current tag if 1 and 2 are true). + * + * This function assumes the mutex is held on the environment. + * + * @param env The environment. + * @param trigger The trigger. + * @param intended_tag The intended tag. 
+ */ +static bool handle_message_now(environment_t* env, trigger_t* trigger, tag_t intended_tag) { + return trigger->reactions[0]->index >= max_level_allowed_to_advance + && lf_tag_compare(intended_tag, lf_tag(env)) == 0 + && lf_tag_compare(intended_tag, trigger->last_tag) > 0 + && lf_tag_compare(intended_tag, trigger->last_known_status_tag) > 0 + && env->execution_started + && !trigger->is_physical; +} - // Iterate until we either successfully connect or exceed the number of - // attempts given by CONNECT_NUM_RETRIES. - int socket_id = -1; - while (result < 0) { - // Create an IPv4 socket for TCP (not UDP) communication over IP (0). - socket_id = create_real_time_tcp_socket_errexit(); +/** + * Handle a message being received from a remote federate. + * + * This function assumes the caller does not hold the mutex lock. + * @param socket Pointer to the socket to read the message from. + * @param fed_id The sending federate ID or -1 if the centralized coordination. + * @return 0 for success, -1 for failure. + */ +static int handle_message(int* socket, int fed_id) { + // Read the header. + size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); + unsigned char buffer[bytes_to_read]; + if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { + // Read failed, which means the socket has been closed between reading the + // message ID byte and here. + return -1; + } - // Server file descriptor. - struct sockaddr_in server_fd; - // Zero out the server_fd struct. - bzero((char*)&server_fd, sizeof(server_fd)); + // Extract the header information. + unsigned short port_id; + unsigned short federate_id; + size_t length; + extract_header(buffer, &port_id, &federate_id, &length); + // Check if the message is intended for this federate + assert(_lf_my_fed_id == federate_id); + LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); - // Set up the server_fd fields. 
- server_fd.sin_family = AF_INET; // IPv4 - server_fd.sin_addr = host_ip_addr; // Received from the RTI + // Get the triggering action for the corresponding port + lf_action_base_t* action = action_for_port(port_id); - // Convert the port number from host byte order to network byte order. - server_fd.sin_port = htons(uport); - result = connect( - socket_id, - (struct sockaddr *)&server_fd, - sizeof(server_fd)); - - if (result != 0) { - lf_print_error("Failed to connect to federate %d on port %d.", remote_federate_id, uport); - - // Try again after some time if the connection failed. - // Note that this should not really happen since the remote federate should be - // accepting socket connections. But possibly it will be busy (in process of accepting - // another socket connection?). Hence, we retry. - count_retries++; - if (count_retries > CONNECT_NUM_RETRIES) { - // If the remote federate is not accepting the connection after CONNECT_NUM_RETRIES - // treat it as a soft error condition and return. - lf_print_error("Failed to connect to federate %d after %d retries. Giving up.", - remote_federate_id, CONNECT_NUM_RETRIES); - return; - } - lf_print_warning("Could not connect to federate %d. Will try again every %lld nanoseconds.\n", - remote_federate_id, ADDRESS_QUERY_RETRY_INTERVAL); - // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. - if (lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; - } - } else { - // Connect was successful. - size_t buffer_length = 1 + sizeof(uint16_t) + 1; - unsigned char buffer[buffer_length]; - buffer[0] = MSG_TYPE_P2P_SENDING_FED_ID; - if (_lf_my_fed_id > UINT16_MAX) { - // This error is very unlikely to occur. - lf_print_error_and_exit("Too many federates! 
More than %d.", UINT16_MAX); - } - encode_uint16((uint16_t)_lf_my_fed_id, (unsigned char*)&(buffer[1])); - unsigned char federation_id_length = (unsigned char)strnlen(federation_metadata.federation_id, 255); - buffer[sizeof(uint16_t) + 1] = federation_id_length; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_FED_ID, _lf_my_fed_id, remote_federate_id, NULL); - write_to_socket_errexit(socket_id, - buffer_length, buffer, - "Failed to send fed_id to federate %d.", remote_federate_id); - write_to_socket_errexit(socket_id, - federation_id_length, (unsigned char*)federation_metadata.federation_id, - "Failed to send federation id to federate %d.", - remote_federate_id); - - read_from_socket_errexit(socket_id, 1, (unsigned char*)buffer, - "Failed to read MSG_TYPE_ACK from federate %d in response to sending fed_id.", - remote_federate_id); - if (buffer[0] != MSG_TYPE_ACK) { - // Get the error code. - read_from_socket_errexit(socket_id, 1, (unsigned char*)buffer, - "Failed to read error code from federate %d in response to sending fed_id.", remote_federate_id); - lf_print_error("Received MSG_TYPE_REJECT message from remote federate (%d).", buffer[0]); - result = -1; - continue; - } else { - lf_print("Connected to federate %d, port %d.", remote_federate_id, port); - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, receive_ACK, _lf_my_fed_id, remote_federate_id, NULL); - } - } + // Read the payload. + // Allocate memory for the message contents. + unsigned char* message_contents = (unsigned char*)malloc(length); + if (read_from_socket_close_on_error(socket, length, message_contents)) { + return -1; } - // Once we set this variable, then all future calls to close() on this - // socket ID should reset it to -1 within a critical section. 
- _fed.sockets_for_outbound_p2p_connections[remote_federate_id] = socket_id; + // Trace the event when tracing is enabled + tracepoint_federate_from_federate(_fed.trace, receive_P2P_MSG, _lf_my_fed_id, federate_id, NULL); + LF_PRINT_LOG("Message received by federate: %s. Length: %zu.", message_contents, length); - // Start a thread to listen for upstream messages (MSG_TYPE_CLOSE_REQUEST) from - // this downstream federate. - uint16_t* remote_fed_id_copy = (uint16_t*)malloc(sizeof(uint16_t)); - if (remote_fed_id_copy == NULL) { - lf_print_error_and_exit("malloc failed."); - } - *remote_fed_id_copy = remote_federate_id; - lf_thread_t thread_id; - result = lf_thread_create( - &thread_id, - listen_for_upstream_messages_from_downstream_federates, - remote_fed_id_copy); - if (result != 0) { - // Failed to create a listening thread. - lf_print_error_and_exit( - "Failed to create a thread to listen for upstream message. Error code: %d.", - result - ); - } + LF_PRINT_DEBUG("Calling schedule for message received on a physical connection."); + _lf_schedule_value(action, 0, message_contents, length); + return 0; } -#ifdef FEDERATED_AUTHENTICATED /** - * Perform HMAC-based authentication with the RTI, using the federation ID - * as an HMAC key. - * - * @param rti_socket TCP socket for connection with the RTI. + * Handle a tagged message being received from a remote federate via the RTI + * or directly from other federates. + * This will read the tag encoded in the header + * and calculate an offset to pass to the schedule function. + * This function assumes the caller does not hold the mutex lock. + * Instead of holding the mutex lock, this function calls + * _lf_increment_tag_barrier with the tag carried in + * the message header as an argument. This ensures that the current tag + * will not advance to the tag of the message if it is in the future, or + * the tag will not advance at all if the tag of the message is + * now or in the past. 
+ * @param socket Pointer to the socket to read the message from. + * @param fed_id The sending federate ID or -1 if the centralized coordination. + * @return 0 on successfully reading the message, -1 on failure (e.g. due to socket closed). */ -void perform_hmac_authentication(int rti_socket) { - - // Send buffer including message type, federate ID, federate's nonce. - size_t fed_id_length = sizeof(uint16_t); - size_t message_length = 1 + fed_id_length + NONCE_LENGTH; - unsigned char fed_hello_buf[message_length]; - fed_hello_buf[0] = MSG_TYPE_FED_NONCE; - encode_uint16((uint16_t)_lf_my_fed_id, &fed_hello_buf[1]); - unsigned char fed_nonce[NONCE_LENGTH]; - RAND_bytes(fed_nonce, NONCE_LENGTH); - memcpy(&fed_hello_buf[1 + fed_id_length], fed_nonce, NONCE_LENGTH); - write_to_socket(rti_socket, message_length, fed_hello_buf); - - // Check HMAC of received FED_RESPONSE message. - unsigned int hmac_length = SHA256_HMAC_LENGTH; - size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); +static int handle_tagged_message(int* socket, int fed_id) { + // Environment is always the one corresponding to the top-level scheduling enclave. + environment_t *env; + _lf_get_environments(&env); - unsigned char received[1 + NONCE_LENGTH + hmac_length]; - read_from_socket_errexit(rti_socket, 1 + NONCE_LENGTH + hmac_length, received, "Failed to read RTI response."); - if (received[0] != MSG_TYPE_RTI_RESPONSE) { - lf_print_error("Received unexpected response %u from the RTI (see net_common.h).", - received[0]); + // Read the header which contains the timestamp. + size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + + sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[bytes_to_read]; + if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { + return -1; // Read failed. } - // Create tag to compare to received tag. 
- unsigned char buf_to_check[1 + fed_id_length + NONCE_LENGTH]; - buf_to_check[0] = MSG_TYPE_RTI_RESPONSE; - encode_uint16((uint16_t)_lf_my_fed_id, &buf_to_check[1]); - memcpy(&buf_to_check[1 + fed_id_length], fed_nonce, NONCE_LENGTH); - unsigned char fed_tag[hmac_length]; - HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, buf_to_check, 1 + fed_id_length + NONCE_LENGTH, - fed_tag, &hmac_length); - // Compare received tag and created tag. - if (memcmp(&received[1 + NONCE_LENGTH], fed_tag, hmac_length) != 0) { - // HMAC does not match. Send back a MSG_TYPE_REJECT message. - lf_print_error("HMAC authentication failed."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = HMAC_DOES_NOT_MATCH; - write_to_socket_errexit( - rti_socket, 2, response, - "Federate failed to write MSG_TYPE_REJECT message on the socket."); - close(rti_socket); - } - else { - LF_PRINT_LOG("HMAC verified."); - // HMAC tag is created with MSG_TYPE_FED_RESPONSE and received federate nonce. - unsigned char mac_buf[1 + NONCE_LENGTH]; - mac_buf[0] = MSG_TYPE_FED_RESPONSE; - memcpy(&mac_buf[1], &received[1], NONCE_LENGTH); - // Buffer for message type and HMAC tag. - unsigned char sender[1 + hmac_length]; - sender[0] = MSG_TYPE_FED_RESPONSE; - HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, mac_buf, 1 + NONCE_LENGTH, - &sender[1], &hmac_length); - write_to_socket(rti_socket, 1 + hmac_length, sender); + // Extract the header information. 
+ unsigned short port_id; + unsigned short federate_id; + size_t length; + tag_t intended_tag; + extract_timed_header(buffer, &port_id, &federate_id, &length, &intended_tag); + // Trace the event when tracing is enabled + if (fed_id == -1) { + tracepoint_federate_from_rti(_fed.trace, receive_TAGGED_MSG, _lf_my_fed_id, &intended_tag); + } else { + tracepoint_federate_from_federate(_fed.trace, receive_P2P_TAGGED_MSG, _lf_my_fed_id, fed_id, &intended_tag); } -} -#endif + // Check if the message is intended for this federate + assert(_lf_my_fed_id == federate_id); + LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); -/** - * Connect to the RTI at the specified host and port and return - * the socket descriptor for the connection. If this fails, the - * program exits. If it succeeds, it sets the _fed.socket_TCP_RTI global - * variable to refer to the socket for communicating with the RTI. - * @param hostname A hostname, such as "localhost". - * @param port_number A port number. - */ -void connect_to_rti(const char* hostname, int port) { - LF_PRINT_LOG("Connecting to the RTI."); + // Get the triggering action for the corresponding port + lf_action_base_t* action = action_for_port(port_id); - // override passed hostname and port if passed as runtime arguments - hostname = federation_metadata.rti_host ? federation_metadata.rti_host : hostname; - port = federation_metadata.rti_port >= 0 ? federation_metadata.rti_port : port; + // Record the physical time of arrival of the message + instant_t time_of_arrival = lf_time_physical(); - uint16_t uport = 0; - if (port < 0 || - port > INT16_MAX) { - lf_print_error( - "connect_to_rti(): Specified port (%d) is out of range," - " using zero instead.", - port - ); - } else { - uport = (uint16_t)port; + if (action->trigger->is_physical) { + // Messages sent on physical connections should be handled via handle_message(). 
+ lf_print_error_and_exit("Received a tagged message on a physical connection."); } - // Repeatedly try to connect, one attempt every 2 seconds, until - // either the program is killed, the sleep is interrupted, - // or the connection succeeds. - // If the specified port is 0, set it instead to the start of the - // port range. - bool specific_port_given = true; - if (uport == 0) { - uport = STARTING_PORT; - specific_port_given = false; +#ifdef FEDERATED_DECENTRALIZED + // Only applicable for federated programs with decentralized coordination: + // For logical connections in decentralized coordination, + // increment the barrier to prevent advancement of tag beyond + // the received tag if possible. The following function call + // suggests that the tag barrier be raised to the tag provided + // by the message. If this tag is in the past, the function will cause + // the tag to freeze at the current level. + // If something happens, make sure to release the barrier. + _lf_increment_tag_barrier(env, intended_tag); +#endif + LF_PRINT_LOG("Received message on port %d with intended tag: " PRINTF_TAG ", Current tag: " PRINTF_TAG ".", + port_id, intended_tag.time - start_time, intended_tag.microstep, + lf_time_logical_elapsed(env), env->current_tag.microstep); + + // Read the payload. + // Allocate memory for the message contents. + unsigned char* message_contents = (unsigned char*)malloc(length); + if (read_from_socket_close_on_error(socket, length, message_contents)) { +#ifdef FEDERATED_DECENTRALIZED + _lf_decrement_tag_barrier_locked(env); +#endif + return -1; // Read failed. } - int result = -1; - int count_retries = 0; - struct addrinfo hints; - struct addrinfo *res; + // The following is only valid for string messages. 
+ // LF_PRINT_DEBUG("Message received: %s.", message_contents); - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_INET; /* Allow IPv4 */ - hints.ai_socktype = SOCK_STREAM; /* Stream socket */ - hints.ai_protocol = IPPROTO_TCP; /* TCP protocol */ - hints.ai_addr = NULL; - hints.ai_next = NULL; - hints.ai_flags = AI_NUMERICSERV; /* Allow only numeric port numbers */ + LF_MUTEX_LOCK(env->mutex); - while (result < 0) { - // Convert port number to string - char str[6]; - sprintf(str,"%u",uport); - - // Get address structure matching hostname and hints criteria, and - // set port to the port number provided in str. There should only - // ever be one matching address structure, and we connect to that. - int server = getaddrinfo(hostname, (const char*)&str, &hints, &res); - if (server != 0) { - lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); - } + action->trigger->physical_time_of_arrival = time_of_arrival; - // Create a socket - _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); + // Create a token for the message + lf_token_t* message_token = _lf_new_token((token_type_t*)action, message_contents, length); - result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); - if (result == 0) { - lf_print("Successfully connected to RTI."); - } + if (handle_message_now(env, action->trigger, intended_tag)) { + // Since the message is intended for the current tag and a port absent reaction + // was waiting for the message, trigger the corresponding reactions for this message. - freeaddrinfo(res); /* No longer needed */ + update_last_known_status_on_input_port(env, intended_tag, port_id); - // If this failed, try more ports, unless a specific port was given. - if (result != 0 - && !specific_port_given - && uport >= STARTING_PORT - && uport <= STARTING_PORT + PORT_RANGE_LIMIT - ) { - lf_print("Failed to connect to RTI on port %d. Trying %d.", uport, uport + 1); - uport++; - // Wait PORT_KNOCKING_RETRY_INTERVAL seconds. 
- if (lf_sleep(PORT_KNOCKING_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; + LF_PRINT_LOG( + "Inserting reactions directly at tag " PRINTF_TAG ". " + "Intended tag: " PRINTF_TAG ".", + env->current_tag.time - lf_time_start(), + env->current_tag.microstep, + intended_tag.time - lf_time_start(), + intended_tag.microstep + ); + // Only set the intended tag of the trigger if it is being executed now + // because otherwise this may preempt the intended_tag of a previous activation + // of the trigger. + action->trigger->intended_tag = intended_tag; + + // This will mark the STP violation in the reaction if the message is tardy. + _lf_insert_reactions_for_trigger(env, action->trigger, message_token); + + // Set the status of the port as present here to inform the network input + // port absent reactions know that they no longer need to block. The reason for + // that is because the network receiver reaction is now in the reaction queue + // keeping the precedence order intact. + set_network_port_status(port_id, present); + } else { + // If no port absent reaction is waiting for this message, or if the intended + // tag is in the future, or the message is tardy, use schedule functions to process the message. + + tag_t actual_tag = intended_tag; +#ifdef FEDERATED_DECENTRALIZED + // For tardy messages in decentralized coordination, we need to figure out what the actual tag will be. + // (Centralized coordination errors out with tardy messages). + if (lf_tag_compare(intended_tag, env->current_tag) <= 0) { + // Message is tardy. + actual_tag = env->current_tag; + actual_tag.microstep++; + // Check that this is greater than any previously scheduled event for this port. 
+ trigger_t* input_port_action = action_for_port(port_id)->trigger; + if (lf_tag_compare(actual_tag, input_port_action->last_known_status_tag) <= 0) { + actual_tag = input_port_action->last_known_status_tag; + actual_tag.microstep++; } } - // If this still failed, try again with the original port after some time. - if (result < 0) { - if (!specific_port_given && uport == STARTING_PORT + PORT_RANGE_LIMIT + 1) { - uport = STARTING_PORT; - } - count_retries++; - if (count_retries > CONNECT_NUM_RETRIES) { - lf_print_error_and_exit("Failed to connect to the RTI after %d retries. Giving up.", - CONNECT_NUM_RETRIES); - } - lf_print("Could not connect to RTI at %s. Will try again every %lld seconds.", - hostname, CONNECT_RETRY_INTERVAL / BILLION); - // Wait CONNECT_RETRY_INTERVAL nanoseconds. - if (lf_sleep(CONNECT_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; - } +#endif // FEDERATED_DECENTRALIZED + // The following will update the input_port_action->last_known_status_tag. + // For decentralized coordination, this is needed for the thread implementing STAA. + update_last_known_status_on_input_port(env, actual_tag, port_id); + + // If the current time >= stop time, discard the message. + // But only if the stop time is not equal to the start time! + if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0 && env->execution_started) { + lf_print_error("Received message too late. Already at stop tag.\n" + " Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" + " Discarding message and closing the socket.", + env->current_tag.time - start_time, env->current_tag.microstep, + intended_tag.time - start_time, intended_tag.microstep); + // Close socket, reading any incoming data and discarding it. + close_inbound_socket(fed_id, 1); } else { - // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. - // Notify the RTI of the ID of this federate and its federation. 
- unsigned char buffer[4]; + // Need to use intended_tag here, not actual_tag, so that STP violations are detected. + // It will become actual_tag (that is when the reactions will be invoked). + schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); + } + } -#ifdef FEDERATED_AUTHENTICATED - LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); - perform_hmac_authentication(_fed.socket_TCP_RTI); -#else - LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); +#ifdef FEDERATED_DECENTRALIZED + // Only applicable for federated programs with decentralized coordination + // Finally, decrement the barrier to allow the execution to continue + // past the raised barrier + _lf_decrement_tag_barrier_locked(env); #endif - // Send the message type first. - buffer[0] = MSG_TYPE_FED_IDS; - // Next send the federate ID. - if (_lf_my_fed_id > UINT16_MAX) { - lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); - } - encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); - // Next send the federation ID length. - // The federation ID is limited to 255 bytes. - size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); - buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); - - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); - - write_to_socket_errexit(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer, - "Failed to send federate ID to RTI."); + // The mutex is unlocked here after the barrier on + // logical time has been removed to avoid + // the need for unecessary lock and unlock + // operations. + LF_MUTEX_UNLOCK(env->mutex); - // Next send the federation ID itself. 
- write_to_socket_errexit(_fed.socket_TCP_RTI, federation_id_length, (unsigned char*)federation_metadata.federation_id, - "Failed to send federation ID to RTI."); + return 0; +} - // Wait for a response. - // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. - // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter - // is used if clock synchronization will be performed. - unsigned char response; +/** + * Handle a port absent message received from a remote federate. + * This just sets the last known status tag of the port specified + * in the message. + * + * @param socket Pointer to the socket to read the message from + * @param fed_id The sending federate ID or -1 if the centralized coordination. + * @return 0 for success, -1 for failure to complete the read. + */ +static int handle_port_absent_message(int* socket, int fed_id) { + size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[bytes_to_read]; + if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { + return -1; + } - LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); + // Extract the header information. + unsigned short port_id = extract_uint16(buffer); + // The next part of the message is the federate_id, but we don't need it. + // unsigned short federate_id = extract_uint16(&(buffer[sizeof(uint16_t)])); + tag_t intended_tag = extract_tag(&(buffer[sizeof(uint16_t)+sizeof(uint16_t)])); - read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &response, "Failed to read response from RTI."); - if (response == MSG_TYPE_REJECT) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); - // Read one more byte to determine the cause of rejection. 
- unsigned char cause; - read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &cause, "Failed to read the cause of rejection by the RTI."); - if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { - lf_print("Connected to the wrong RTI on port %d. Trying %d.", uport, uport + 1); - uport++; - result = -1; - continue; + // Trace the event when tracing is enabled + if (fed_id == -1) { + tracepoint_federate_from_rti(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, &intended_tag); + } else { + tracepoint_federate_from_federate(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, fed_id, &intended_tag); + } + LF_PRINT_LOG("Handling port absent for tag " PRINTF_TAG " for port %hu of fed %d.", + intended_tag.time - lf_time_start(), + intended_tag.microstep, + port_id, + fed_id + ); + + // Environment is always the one corresponding to the top-level scheduling enclave. + environment_t *env; + _lf_get_environments(&env); + + LF_MUTEX_LOCK(env->mutex); + update_last_known_status_on_input_port(env, intended_tag, port_id); + LF_MUTEX_UNLOCK(env->mutex); + + return 0; +} + +/** + * Thread that listens for inputs from other federates. + * This thread listens for messages of type MSG_TYPE_P2P_MESSAGE, + * MSG_TYPE_P2P_TAGGED_MESSAGE, or MSG_TYPE_PORT_ABSENT (@see net_common.h) from the specified + * peer federate and calls the appropriate handling function for + * each message type. If an error occurs or an EOF is received + * from the peer, then this procedure sets the corresponding + * socket in _fed.sockets_for_inbound_p2p_connections + * to -1 and returns, terminating the thread. + * @param _args The remote federate ID (cast to void*). + * @param fed_id_ptr A pointer to a uint16_t containing federate ID being listened to. + * This procedure frees the memory pointed to before returning. 
+ */ +static void* listen_to_federates(void* _args) { + uint16_t fed_id = (uint16_t)(uintptr_t)_args; + + LF_PRINT_LOG("Listening to federate %d.", fed_id); + + int* socket_id = &_fed.sockets_for_inbound_p2p_connections[fed_id]; + + // Buffer for incoming messages. + // This does not constrain the message size + // because the message will be put into malloc'd memory. + unsigned char buffer[FED_COM_BUFFER_SIZE]; + + // Listen for messages from the federate. + while (1) { + bool socket_closed = false; + // Read one byte to get the message type. + LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", *socket_id); + if (read_from_socket_close_on_error(socket_id, 1, buffer)) { + // Socket has been closed. + lf_print("Socket from federate %d is closed.", fed_id); + // Stop listening to this federate. + socket_closed = true; + break; + } + LF_PRINT_DEBUG("Received a P2P message on socket %d of type %d.", + *socket_id, buffer[0]); + bool bad_message = false; + switch (buffer[0]) { + case MSG_TYPE_P2P_MESSAGE: + LF_PRINT_LOG("Received untimed message from federate %d.", fed_id); + if (handle_message(socket_id, fed_id)) { + // Failed to complete the reading of a message on a physical connection. + lf_print_warning("Failed to complete reading of message on physical connection."); + socket_closed = true; } - lf_print_error_and_exit("RTI Rejected MSG_TYPE_FED_IDS message with response (see net_common.h): " - "%d. Error code: %d. Federate quits.\n", response, cause); - } else if (response == MSG_TYPE_ACK) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); - LF_PRINT_LOG("Received acknowledgment from the RTI."); - - // Call a generated (external) function that sends information - // about connections between this federate and other federates - // where messages are routed through the RTI. 
- // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h - send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); - - uint16_t udp_port = setup_clock_synchronization_with_rti(); - - // Write the returned port number to the RTI - unsigned char UDP_port_number[1 + sizeof(uint16_t)]; - UDP_port_number[0] = MSG_TYPE_UDP_PORT; - encode_uint16(udp_port, &(UDP_port_number[1])); - write_to_socket_errexit(_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, - "Failed to send the UDP port number to the RTI."); - } else { - lf_print_error_and_exit("Received unexpected response %u from the RTI (see net_common.h).", - response); + break; + case MSG_TYPE_P2P_TAGGED_MESSAGE: + LF_PRINT_LOG("Received tagged message from federate %d.", fed_id); + if (handle_tagged_message(socket_id, fed_id)) { + // P2P tagged messages are only used in decentralized coordination, and + // it is not a fatal error if the socket is closed before the whole message is read. + // But this thread should exit. + lf_print_warning("Failed to complete reading of tagged message."); + socket_closed = true; + } + break; + case MSG_TYPE_PORT_ABSENT: + LF_PRINT_LOG("Received port absent message from federate %d.", fed_id); + if (handle_port_absent_message(socket_id, fed_id)) { + // P2P tagged messages are only used in decentralized coordination, and + // it is not a fatal error if the socket is closed before the whole message is read. + // But this thread should exit. + lf_print_warning("Failed to complete reading of tagged message."); + socket_closed = true; + } + break; + default: + bad_message = true; + } + if (bad_message) { + lf_print_error("Received erroneous message type: %d. 
Closing the socket.", buffer[0]); + // Trace the event when tracing is enabled + tracepoint_federate_from_federate(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, fed_id, NULL); + break; // while loop + } + if (socket_closed) { + // NOTE: For decentralized execution, once this socket is closed, we could + // update last known tags of all ports connected to the specified federate to FOREVER_TAG, + // which would eliminate the need to wait for STAA to assume an input is absent. + // However, at this time, we don't know which ports correspond to which upstream federates. + // The code generator would have to encode this information. Once that is done, + // we could call update_last_known_status_on_input_port with FOREVER_TAG. + + break; // while loop + } + } + return NULL; +} + +/** + * Close the socket that sends outgoing messages to the + * specified federate ID. This function acquires the lf_outbound_socket_mutex mutex lock + * if _lf_normal_termination is true and otherwise proceeds without the lock. + * @param fed_id The ID of the peer federate receiving messages from this + * federate, or -1 if the RTI (centralized coordination). + * @param flag 0 if the socket has received EOF, 1 if not, -1 if abnormal termination. + */ +static void close_outbound_socket(int fed_id, int flag) { + assert (fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); + if (_lf_normal_termination) { + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + } + if (_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { + // Close the socket by sending a FIN packet indicating that no further writes + // are expected. Then read until we get an EOF indication. + if (flag >= 0) { + // SHUT_WR indicates no further outgoing messages. + shutdown(_fed.sockets_for_outbound_p2p_connections[fed_id], SHUT_WR); + if (flag > 0) { + // Have not received EOF yet. read until we get an EOF or error indication. 
+ // This compensates for delayed ACKs and disabling of Nagles algorithm + // by delaying exiting until the shutdown is complete. + unsigned char message[32]; + while (read(_fed.sockets_for_outbound_p2p_connections[fed_id], &message, 32) > 0); } - lf_print("Connected to RTI at %s:%d.", hostname, uport); } + close(_fed.sockets_for_outbound_p2p_connections[fed_id]); + _fed.sockets_for_outbound_p2p_connections[fed_id] = -1; + } + if (_lf_normal_termination) { + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + } +} + +#ifdef FEDERATED_AUTHENTICATED +/** + * Perform HMAC-based authentication with the RTI, using the federation ID + * as an HMAC key. + * @return 0 for success, -1 for failure. + */ +static int perform_hmac_authentication() { + + // Send buffer including message type, federate ID, federate's nonce. + size_t fed_id_length = sizeof(uint16_t); + size_t message_length = 1 + fed_id_length + NONCE_LENGTH; + unsigned char fed_hello_buf[message_length]; + fed_hello_buf[0] = MSG_TYPE_FED_NONCE; + encode_uint16((uint16_t)_lf_my_fed_id, &fed_hello_buf[1]); + unsigned char fed_nonce[NONCE_LENGTH]; + RAND_bytes(fed_nonce, NONCE_LENGTH); + memcpy(&fed_hello_buf[1 + fed_id_length], fed_nonce, NONCE_LENGTH); + + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, message_length, fed_hello_buf, NULL, + "Failed to write nonce."); + + // Check HMAC of received FED_RESPONSE message. 
+ unsigned int hmac_length = SHA256_HMAC_LENGTH; + size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); + + unsigned char received[1 + NONCE_LENGTH + hmac_length]; + if (read_from_socket_close_on_error(&_fed.socket_TCP_RTI, 1 + NONCE_LENGTH + hmac_length, received)) { + lf_print_warning("Failed to read RTI response."); + return -1; + } + if (received[0] != MSG_TYPE_RTI_RESPONSE) { + if (received[0] == MSG_TYPE_FAILED) { + lf_print_error("RTI has failed."); + return -1; + } else { + lf_print_error( + "Received unexpected response %u from the RTI (see net_common.h).", + received[0]); + return -1; + } + } + // Create tag to compare to received tag. + unsigned char buf_to_check[1 + fed_id_length + NONCE_LENGTH]; + buf_to_check[0] = MSG_TYPE_RTI_RESPONSE; + encode_uint16((uint16_t)_lf_my_fed_id, &buf_to_check[1]); + memcpy(&buf_to_check[1 + fed_id_length], fed_nonce, NONCE_LENGTH); + unsigned char fed_tag[hmac_length]; + HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, buf_to_check, 1 + fed_id_length + NONCE_LENGTH, + fed_tag, &hmac_length); + + // Compare received tag and created tag. + if (memcmp(&received[1 + NONCE_LENGTH], fed_tag, hmac_length) != 0) { + // HMAC does not match. Send back a MSG_TYPE_REJECT message. + lf_print_error("HMAC authentication failed."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = HMAC_DOES_NOT_MATCH; + + // Ignore errors on writing back. + write_to_socket(_fed.socket_TCP_RTI, 2, response); + return -1; + } else { + LF_PRINT_LOG("HMAC verified."); + // HMAC tag is created with MSG_TYPE_FED_RESPONSE and received federate nonce. + unsigned char mac_buf[1 + NONCE_LENGTH]; + mac_buf[0] = MSG_TYPE_FED_RESPONSE; + memcpy(&mac_buf[1], &received[1], NONCE_LENGTH); + // Buffer for message type and HMAC tag. 
+ unsigned char sender[1 + hmac_length]; + sender[0] = MSG_TYPE_FED_RESPONSE; + HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, mac_buf, 1 + NONCE_LENGTH, + &sender[1], &hmac_length); + + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, 1 + hmac_length, sender, NULL, + "Failed to write fed response."); + } + return 0; +} +#endif + +static void close_rti_socket() { + shutdown(_fed.socket_TCP_RTI, SHUT_RDWR); + close(_fed.socket_TCP_RTI); + _fed.socket_TCP_RTI = -1; +} + +/** + * Return in the result a struct with the address info for the specified hostname and port. + * The memory for the result is dynamically allocated and must be freed using freeaddrinfo. + * @param hostname The host name. + * @param port The port number. + * @param result The struct into which to write. + */ +static void rti_address(const char* hostname, uint16_t port, struct addrinfo** result) { + struct addrinfo hints; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_INET; /* Allow IPv4 */ + hints.ai_socktype = SOCK_STREAM; /* Stream socket */ + hints.ai_protocol = IPPROTO_TCP; /* TCP protocol */ + hints.ai_addr = NULL; + hints.ai_next = NULL; + hints.ai_flags = AI_NUMERICSERV; /* Allow only numeric port numbers */ + + // Convert port number to string. + char str[6]; + sprintf(str, "%u", port); + + // Get address structure matching hostname and hints criteria, and + // set port to the port number provided in str. There should only + // ever be one matching address structure, and we connect to that. + if (getaddrinfo(hostname, (const char*)&str, &hints, result)) { + lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); } } @@ -1169,23 +981,27 @@ void connect_to_rti(const char* hostname, int port) { * @param my_physical_time The physical time at this federate. * @return The designated start time for the federate. 
*/ -instant_t get_start_time_from_rti(instant_t my_physical_time) { +static instant_t get_start_time_from_rti(instant_t my_physical_time) { // Send the timestamp marker first. - _lf_send_time(MSG_TYPE_TIMESTAMP, my_physical_time, true); + send_time(MSG_TYPE_TIMESTAMP, my_physical_time); // Read bytes from the socket. We need 9 bytes. // Buffer for message ID plus timestamp. size_t buffer_length = 1 + sizeof(instant_t); unsigned char buffer[buffer_length]; - read_from_socket_errexit(_fed.socket_TCP_RTI, buffer_length, buffer, + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, buffer_length, buffer, NULL, "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); LF_PRINT_DEBUG("Read 9 bytes."); // First byte received is the message ID. if (buffer[0] != MSG_TYPE_TIMESTAMP) { - lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", - buffer[0]); + if (buffer[0] == MSG_TYPE_FAILED) { + lf_print_error_and_exit("RTI has failed."); + } + lf_print_error_and_exit( + "Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", + buffer[0]); } instant_t timestamp = extract_int64(&(buffer[1])); @@ -1199,1500 +1015,1381 @@ instant_t get_start_time_from_rti(instant_t my_physical_time) { return timestamp; } -//////////////////////////////// Port Status Handling /////////////////////////////////////// - -extern lf_action_base_t* _lf_action_table[]; -extern interval_t _lf_action_delay_table[]; -extern size_t _lf_action_table_size; -extern lf_action_base_t* _lf_zero_delay_action_table[]; -extern size_t _lf_zero_delay_action_table_size; -extern reaction_t* network_input_reactions[]; -extern size_t num_network_input_reactions; -extern reaction_t* port_absent_reaction[]; -extern size_t num_sender_reactions; -#ifdef FEDERATED_DECENTRALIZED -extern staa_t* staa_lst[]; -extern size_t staa_lst_size; -#endif - /** - * Return a pointer to the action struct for the action - * corresponding to the specified port ID. 
- * @param port_id The port ID. - * @return A pointer to an action struct or null if the ID is out of range. + * Handle a time advance grant (TAG) message from the RTI. + * This updates the last known status tag for each network input + * port, and broadcasts a signal, which may cause a blocking + * port absent reaction to unblock. + * + * In addition, this updates the last known TAG/PTAG and broadcasts + * a notification of this update, which may unblock whichever worker + * thread is trying to advance time. + * + * @note This function is very similar to handle_provisinal_tag_advance_grant() except that + * it sets last_TAG_was_provisional to false. */ -lf_action_base_t* _lf_action_for_port(int port_id) { - if (port_id < _lf_action_table_size) { - return _lf_action_table[port_id]; +static void handle_tag_advance_grant(void) { + // Environment is always the one corresponding to the top-level scheduling enclave. + environment_t *env; + _lf_get_environments(&env); + + size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read tag advance grant from RTI."); + tag_t TAG = extract_tag(buffer); + + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_TAG, _lf_my_fed_id, &TAG); + + LF_MUTEX_LOCK(env->mutex); + + // Update the last known status tag of all network input ports + // to the TAG received from the RTI. Here we assume that the RTI + // knows the status of network ports up to and including the granted tag, + // so by extension, we assume that the federate can safely rely + // on the RTI to handle port statuses up until the granted tag. + update_last_known_status_on_input_ports(TAG); + + // It is possible for this federate to have received a PTAG + // earlier with the same tag as this TAG. 
+ if (lf_tag_compare(TAG, _fed.last_TAG) >= 0) { + _fed.last_TAG = TAG; + _fed.is_last_TAG_provisional = false; + LF_PRINT_LOG("Received Time Advance Grant (TAG): " PRINTF_TAG ".", + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + } else { + LF_MUTEX_UNLOCK(env->mutex); + lf_print_error("Received a TAG " PRINTF_TAG " that wasn't larger " + "than the previous TAG or PTAG " PRINTF_TAG ". Ignoring the TAG.", + TAG.time - start_time, TAG.microstep, + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + return; } - lf_print_error("Invalid port ID: %d", port_id); - return NULL; + // Notify everything that is blocked. + lf_cond_broadcast(&env->event_q_changed); + + LF_MUTEX_UNLOCK(env->mutex); } +#ifdef FEDERATED_DECENTRALIZED /** - * Set the status of network port with id portID. + * @brief Return whether there exists an input port whose status is unknown. * - * @param portID The network port ID - * @param status The network port status (port_status_t) + * @param staa_elem A record of all input port actions. */ -void set_network_port_status(int portID, port_status_t status) { - lf_action_base_t* network_input_port_action = _lf_action_for_port(portID); - network_input_port_action->trigger->status = status; +static bool a_port_is_unknown(staa_t* staa_elem) { + bool do_wait = false; + for (int j = 0; j < staa_elem->num_actions; ++j) { + if (staa_elem->actions[j]->trigger->status == unknown) { + do_wait = true; + break; + } + } + return do_wait; } +#endif /** - * Update the last known status tag of all network input ports - * to the value of `tag`, unless that the provided `tag` is less - * than the last_known_status_tag of the port. This is called when - * all inputs to network ports with tags up to and including `tag` - * have been received by those ports. If any update occurs, - * then this broadcasts on `port_status_changed`. - * - * This assumes the caller holds the mutex. 
- * - * @param tag The tag on which the latest status of all network input - * ports is known. + * @brief Return the port ID of the port associated with the given action. + * @return The port ID or -1 if there is no match. */ -void update_last_known_status_on_input_ports(tag_t tag) { - LF_PRINT_DEBUG("In update_last_known_status_on_input ports."); - bool notify = false; +static int id_of_action(lf_action_base_t* input_port_action) { for (int i = 0; i < _lf_action_table_size; i++) { - lf_action_base_t* input_port_action = _lf_action_for_port(i); - // This is called when a TAG is received. - // But it is possible for an input port to have received already - // a message with a larger tag (if there is an after delay on the - // connection), in which case, the last known status tag of the port - // is in the future and should not be rolled back. So in that case, - // we do not update the last known status tag. - if (lf_tag_compare(tag, - input_port_action->trigger->last_known_status_tag) >= 0) { - LF_PRINT_DEBUG( - "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", - i, - input_port_action->trigger->last_known_status_tag.time - lf_time_start(), - input_port_action->trigger->last_known_status_tag.microstep, - tag.time - lf_time_start(), - tag.microstep - ); - input_port_action->trigger->last_known_status_tag = tag; - notify = true; - } - } - // FIXME: We could put a condition variable into the trigger_t - // struct for each network input port, in which case this won't - // be a broadcast but rather a targetted signal. - if (notify && update_max_level(tag, false)) { - // Notify network input reactions - lf_cond_broadcast(&port_status_changed); + if (_lf_action_table[i] == input_port_action) return i; } + return -1; } /** - * Update the last known status tag of a network input port - * to the value of "tag". This is the largest tag at which the status - * (present or absent) of the port is known. 
- * - * This function assumes the caller holds the mutex, and, if the tag - * actually increases, it broadcasts on `port_status_changed`. - * - * @param tag The tag on which the latest status of network input - * ports is known. - * @param portID The port ID + * @brief Thread handling setting the known absent status of input ports. + * For the code-generated array of staa offsets `staa_lst`, which is sorted by STAA offset, + * wait for physical time to advance to the current time plus the STAA offset, + * then set the absent status of the input ports associated with the STAA. + * Then wait for current time to advance and start over. */ -void update_last_known_status_on_input_port(tag_t tag, int port_id) { - trigger_t* input_port_action = _lf_action_for_port(port_id)->trigger; - if (lf_tag_compare(tag, - input_port_action->last_known_status_tag) >= 0) { - if (lf_tag_compare(tag, - input_port_action->last_known_status_tag) == 0) { - // If the intended tag for an input port is equal to the last known status, we need - // to increment the microstep. This is a direct result of the behavior of the lf_delay_tag() - // semantics in tag.h. - tag.microstep++; +#ifdef FEDERATED_DECENTRALIZED +static void* update_ports_from_staa_offsets(void* args) { + if (staa_lst_size == 0) return NULL; // Nothing to do. + // NOTE: Using only the top-level environment, which is the one that deals with network + // input ports. + environment_t *env; + int num_envs = _lf_get_environments(&env); + LF_MUTEX_LOCK(env->mutex); + while (1) { + LF_PRINT_DEBUG("**** (update thread) starting"); + tag_t tag_when_started_waiting = lf_tag(env); + for (int i = 0; i < staa_lst_size; ++i) { + staa_t* staa_elem = staa_lst[i]; + // The staa_elem is adjusted in the code generator to have subtracted the delay on the connection. + // The list is sorted in increasing order of adjusted STAA offsets. + // The wait_until function automatically adds the _lf_fed_STA_offset to the wait time. 
+ interval_t wait_until_time = env->current_tag.time + staa_elem->STAA; + LF_PRINT_DEBUG("**** (update thread) original wait_until_time: " PRINTF_TIME, wait_until_time - lf_time_start()); + + // The wait_until call will release the env->mutex while it is waiting. + // However, it will not release the env->mutex if the wait time is too small. + // At the cost of a small additional delay in deciding a port is absent, + // we require a minimum wait time here. Otherwise, if both the STAA and STA are + // zero, this thread will fail to ever release the environment mutex. + // This causes chaos. The MIN_SLEEP_DURATION is the smallest amount of time + // that wait_until will actually wait. Note that this strategy does not + // block progress of any execution that is actually processing events. + // It only slightly delays the decision that an event is absent, and only + // if the STAA and STA are extremely small. + if (_lf_fed_STA_offset + staa_elem->STAA < 5 * MIN_SLEEP_DURATION) { + wait_until_time += 5 * MIN_SLEEP_DURATION; + } + while (a_port_is_unknown(staa_elem)) { + LF_PRINT_DEBUG("**** (update thread) waiting until: " PRINTF_TIME, wait_until_time - lf_time_start()); + if (wait_until(env, wait_until_time, &lf_port_status_changed)) { + if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { + break; + } + /* Possibly useful for debugging: + tag_t current_tag = lf_tag(env); + LF_PRINT_DEBUG("**** (update thread) Assuming absent! 
" PRINTF_TAG, current_tag.time - lf_time_start(), current_tag.microstep); + LF_PRINT_DEBUG("**** (update thread) Lag is " PRINTF_TIME, current_tag.time - lf_time_physical()); + LF_PRINT_DEBUG("**** (update thread) Wait until time is " PRINTF_TIME, wait_until_time - lf_time_start()); + */ + + for (int j = 0; j < staa_elem->num_actions; ++j) { + lf_action_base_t* input_port_action = staa_elem->actions[j]; + if (input_port_action->trigger->status == unknown) { + input_port_action->trigger->status = absent; + LF_PRINT_DEBUG("**** (update thread) Assuming port absent at time " PRINTF_TIME, lf_tag(env).time - start_time); + update_last_known_status_on_input_port(env, lf_tag(env), id_of_action(input_port_action)); + lf_cond_broadcast(&lf_port_status_changed); + } + } } - LF_PRINT_DEBUG( - "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", - port_id, - input_port_action->last_known_status_tag.time - lf_time_start(), - input_port_action->last_known_status_tag.microstep, - tag.time - lf_time_start(), - tag.microstep - ); - input_port_action->last_known_status_tag = tag; - // There is no guarantee that there is either a TAG or a PTAG for this time. - // The message that triggered this to be called could be from an upstream - // federate that is far ahead of other upstream federates in logical time. - // Therefore, do not pass `tag` to `update_max_level`. - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); - } else { - LF_PRINT_DEBUG("Attempt to update the last known status tag " - "of network input port %d to an earlier tag was ignored.", port_id); + // If the tag has advanced, start over. + if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) break; + } + // If the tag has advanced, start over. + if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) break; + } + // If the tag has advanced, start over. 
+ if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) continue; + + // At this point, the current tag is the same as when we started waiting + // and all ports should be known, and hence max_level_allowed_to_advance + // should be INT_MAX. Check this to prevent an infinite wait. + if (max_level_allowed_to_advance != INT_MAX) { + // If this occurs, then the current tag advanced during a wait. + // Some ports may have been reset to uknown during that wait, in which case, + // it would be huge mistake to enter the wait for a new tag below because the + // program will freeze. First, check whether any ports are unknown: + bool port_unkonwn = false; + for (int i = 0; i < staa_lst_size; ++i) { + staa_t* staa_elem = staa_lst[i]; + if (a_port_is_unknown(staa_elem)) { + port_unkonwn = true; + break; + } + } + if (!port_unkonwn) { + // If this occurs, then there is a race condition that can lead to deadlocks. + lf_print_error_and_exit("**** (update thread) Inconsistency: All ports are known, but MLAA is blocking."); + } + + // Since max_level_allowed_to_advance will block advancement of time, we cannot follow + // through to the next step without deadlocking. Wait some time, then continue. + // The wait is necessary to prevent a busy wait. + lf_sleep(2 * MIN_SLEEP_DURATION); + continue; + } + + // Wait until we progress to a new tag. + while (lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { + // The following will release the env->mutex while waiting. + LF_PRINT_DEBUG("**** (update thread) Waiting for tags to not match: " PRINTF_TAG ", " PRINTF_TAG, + lf_tag(env).time - lf_time_start(), lf_tag(env).microstep, + tag_when_started_waiting.time -lf_time_start(), tag_when_started_waiting.microstep); + // Ports are reset to unknown at the start of new tag, so that will wake this up. 
+ lf_cond_wait(&lf_port_status_changed); + } + LF_PRINT_DEBUG("**** (update thread) Tags after wait: " PRINTF_TAG ", " PRINTF_TAG, + lf_tag(env).time - lf_time_start(), lf_tag(env).microstep, + tag_when_started_waiting.time -lf_time_start(), tag_when_started_waiting.microstep); } + LF_MUTEX_UNLOCK(env->mutex); } +#endif // FEDERATED_DECENTRALIZED /** - * Reset the status fields on network input ports to unknown. + * Handle a provisional tag advance grant (PTAG) message from the RTI. + * This updates the last known TAG/PTAG and broadcasts + * a notification of this update, which may unblock whichever worker + * thread is trying to advance time. + * If current_time is less than the specified PTAG, then this will + * also insert into the event_q a dummy event with the specified tag. + * This will ensure that the federate advances time to the specified + * tag and, for centralized coordination, stimulates null-message-sending + * output reactions at that tag. * - * @note This function must be called at the beginning of each - * logical time. + * @note This function is similar to handle_tag_advance_grant() except that + * it sets last_TAG_was_provisional to true and also it does not update the + * last known tag for input ports. */ -void reset_status_fields_on_input_port_triggers() { - for (int i = 0; i < _lf_action_table_size; i++) { - set_network_port_status(i, unknown); +static void handle_provisional_tag_advance_grant() { + // Environment is always the one corresponding to the top-level scheduling enclave. 
+ environment_t *env; + _lf_get_environments(&env); + + size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read provisional tag advance grant from RTI."); + tag_t PTAG = extract_tag(buffer); + + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_PTAG, _lf_my_fed_id, &PTAG); + + // Note: it is important that last_known_status_tag of ports does not + // get updated to a PTAG value because a PTAG does not indicate that + // the RTI knows about the status of all ports up to and _including_ + // the value of PTAG. Only a TAG message indicates that. + LF_MUTEX_LOCK(env->mutex); + + // Sanity check + if (lf_tag_compare(PTAG, _fed.last_TAG) < 0 + || (lf_tag_compare(PTAG, _fed.last_TAG) == 0 && !_fed.is_last_TAG_provisional)) { + LF_MUTEX_UNLOCK(env->mutex); + lf_print_error_and_exit("Received a PTAG " PRINTF_TAG " that is equal or earlier " + "than an already received TAG " PRINTF_TAG ".", + PTAG.time, PTAG.microstep, + _fed.last_TAG.time, _fed.last_TAG.microstep); } - LF_PRINT_DEBUG("Resetting port status fields."); - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); -} -/** - * Enqueue port absent reactions that will send a MSG_TYPE_PORT_ABSENT - * message to downstream federates if a given network output port is not present. - * @param env The environment of the federate - */ -void enqueue_port_absent_reactions(environment_t* env){ - assert(env != GLOBAL_ENVIRONMENT); -#ifdef FEDERATED_CENTRALIZED - if (!_fed.has_downstream) { - // This federate is not connected to any downstream federates via a - // logical connection. No need to trigger port absent - // reactions. 
+ _fed.last_TAG = PTAG; + _fed.is_last_TAG_provisional = true; + LF_PRINT_LOG("At tag " PRINTF_TAG ", received Provisional Tag Advance Grant (PTAG): " PRINTF_TAG ".", + env->current_tag.time - start_time, env->current_tag.microstep, + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + + // Even if we don't modify the event queue, we need to broadcast a change + // because we do not need to continue to wait for a TAG. + lf_cond_broadcast(&env->event_q_changed); + // Notify level advance thread which is blocked. + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_cond_broadcast(&lf_port_status_changed); + + // Possibly insert a dummy event into the event queue if current time is behind + // (which it should be). Do not do this if the federate has not fully + // started yet. + + instant_t dummy_event_time = PTAG.time; + microstep_t dummy_event_relative_microstep = PTAG.microstep; + + if (lf_tag_compare(env->current_tag, PTAG) == 0) { + // The current tag can equal the PTAG if we are at the start time + // or if this federate has been able to advance time to the current + // tag (e.g., it has no upstream federates). In either case, either + // it is already treating the current tag as PTAG cycle (e.g. at the + // start time) or it will be completing the current cycle and sending + // a LTC message shortly. In either case, there is nothing more to do. + LF_MUTEX_UNLOCK(env->mutex); return; - } -#endif - LF_PRINT_DEBUG("Enqueueing port absent reactions at time %lld.", (long long) (env->current_tag.time - start_time)); - if (num_sender_reactions == 0) { - LF_PRINT_DEBUG("No port absent reactions."); + } else if (lf_tag_compare(env->current_tag, PTAG) > 0) { + // Current tag is greater than the PTAG. + // It could be that we have sent an LTC that crossed with the incoming + // PTAG or that we have advanced to a tag greater than the PTAG. + // In the former case, there is nothing more to do. 
+ // In the latter case, we may be blocked processing a PTAG cycle at + // a greater tag or we may be in the middle of processing a regular + // TAG. In either case, we know that at the PTAG tag, all outputs + // have either been sent or are absent, so we can send an LTC. + // Send an LTC to indicate absent outputs. + lf_latest_tag_complete(PTAG); + // Nothing more to do. + LF_MUTEX_UNLOCK(env->mutex); return; + } else if (PTAG.time == env->current_tag.time) { + // We now know env->current_tag < PTAG, but the times are equal. + // Adjust the microstep for scheduling the dummy event. + dummy_event_relative_microstep -= env->current_tag.microstep; } - for (int i = 0; i < num_sender_reactions; i++) { - reaction_t* reaction = port_absent_reaction[i]; - if (reaction && reaction->status == inactive) { - LF_PRINT_DEBUG("Inserting port absent reaction on reaction queue."); - lf_scheduler_trigger_reaction(env->scheduler, reaction, -1); - } + // We now know env->current_tag < PTAG. + + if (dummy_event_time != FOREVER) { + // Schedule a dummy event at the specified time and (relative) microstep. + LF_PRINT_DEBUG("At tag " PRINTF_TAG ", inserting into the event queue a dummy event " + "with time " PRINTF_TIME " and (relative) microstep " PRINTF_MICROSTEP ".", + env->current_tag.time - start_time, env->current_tag.microstep, + dummy_event_time - start_time, dummy_event_relative_microstep); + // Dummy event points to a NULL trigger and NULL real event. + event_t* dummy = _lf_create_dummy_events(env, + NULL, dummy_event_time, NULL, dummy_event_relative_microstep); + pqueue_insert(env->event_q, dummy); } + + LF_MUTEX_UNLOCK(env->mutex); } /** - * Send a port absent message to federate with fed_ID, informing the - * remote federate that the current federate will not produce an event - * on this network port at the current logical time. + * Handle a MSG_TYPE_STOP_GRANTED message from the RTI. 
* - * @param env The environment of the federate - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. - * @param port_ID The ID of the receiving port. - * @param fed_ID The fed ID of the receiving federate. + * This function removes the global barrier on + * logical time raised when lf_request_stop() was + * called in the environment for each enclave. */ -void send_port_absent_to_federate(environment_t* env, interval_t additional_delay, - unsigned short port_ID, - unsigned short fed_ID) { - assert(env != GLOBAL_ENVIRONMENT); +static void handle_stop_granted_message() { - // Construct the message - size_t message_length = 1 + sizeof(port_ID) + sizeof(fed_ID) + sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[message_length]; + size_t bytes_to_read = MSG_TYPE_STOP_GRANTED_LENGTH - 1; + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read stop granted from RTI."); - // Apply the additional delay to the current tag and use that as the intended - // tag of the outgoing message. Note that if there is delay on the connection, - // then we cannot promise no message with tag = current_tag + delay because a - // subsequent reaction might produce such a message. But we can promise no - // message with a tag strictly less than current_tag + delay. 
- tag_t current_message_intended_tag = lf_delay_strict(env->current_tag, - additional_delay); + tag_t received_stop_tag = extract_tag(buffer); - LF_PRINT_LOG("Sending port " - "absent for tag " PRINTF_TAG " for port %d to federate %d.", - current_message_intended_tag.time - start_time, - current_message_intended_tag.microstep, - port_ID, fed_ID); + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_STOP_GRN, _lf_my_fed_id, &received_stop_tag); - buffer[0] = MSG_TYPE_PORT_ABSENT; - encode_uint16(port_ID, &(buffer[1])); - encode_uint16(fed_ID, &(buffer[1+sizeof(port_ID)])); - encode_tag(&(buffer[1+sizeof(port_ID)+sizeof(fed_ID)]), current_message_intended_tag); + LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_GRANTED message with elapsed tag " PRINTF_TAG ".", + received_stop_tag.time - start_time, received_stop_tag.microstep); - lf_mutex_lock(&outbound_socket_mutex); -#ifdef FEDERATED_CENTRALIZED - // Send the absent message through the RTI - int socket = _fed.socket_TCP_RTI; -#else - // Send the absent message directly to the federate - int socket = _fed.sockets_for_outbound_p2p_connections[fed_ID]; -#endif - // Do not write if the socket is closed. - if (socket >= 0) { - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_PORT_ABS, _lf_my_fed_id, ¤t_message_intended_tag); - write_to_socket_with_mutex(socket, message_length, buffer, &outbound_socket_mutex, - "Failed to send port absent message for port %hu to federate %hu.", - port_ID, fed_ID); + environment_t *env; + int num_environments = _lf_get_environments(&env); + + for (int i = 0; i < num_environments; i++) { + LF_MUTEX_LOCK(env[i].mutex); + + // Sanity check. + if (lf_tag_compare(received_stop_tag, env[i].current_tag) <= 0) { + lf_print_error("RTI granted a MSG_TYPE_STOP_GRANTED tag that is equal to or less than this federate's current tag " PRINTF_TAG ". 
" + "Stopping at the next microstep instead.", + env[i].current_tag.time - start_time, env[i].current_tag.microstep); + received_stop_tag = env[i].current_tag; + received_stop_tag.microstep++; + } + + _lf_set_stop_tag(&env[i], received_stop_tag); + LF_PRINT_DEBUG("Setting the stop tag to " PRINTF_TAG ".", + env[i].stop_tag.time - start_time, + env[i].stop_tag.microstep); + + if (env[i].barrier.requestors) _lf_decrement_tag_barrier_locked(&env[i]); + lf_cond_broadcast(&env[i].event_q_changed); + LF_MUTEX_UNLOCK(env[i].mutex); } - lf_mutex_unlock(&outbound_socket_mutex); } -///////////////////////////////////////////////////////////////////////////////////////// - /** - * Version of schedule_value() similar to that in reactor_common.c - * except that it does not acquire the mutex lock and has a special - * behavior during startup where it can inject reactions to the reaction - * queue if execution has not started yet. - * It is also responsible for setting the intended tag of the - * network message based on the calculated delay. - * This function assumes that the caller holds the mutex lock. - * - * This is used for handling incoming timed messages to a federate. - * - * @param env The environment of the federate - * @param action The action or timer to be triggered. - * @param tag The tag of the message received over the network. - * @param value Dynamically allocated memory containing the value to send. - * @param length The length of the array, if it is an array, or 1 for a - * scalar and 0 for no payload. - * @return A handle to the event, or 0 if no event was scheduled, or -1 for error. + * Handle a MSG_TYPE_STOP_REQUEST message from the RTI. 
*/ -static trigger_handle_t schedule_message_received_from_network_locked( - environment_t* env, - trigger_t* trigger, - tag_t tag, - lf_token_t* token) { - assert(env != GLOBAL_ENVIRONMENT); - - // Return value of the function - int return_value = 0; +static void handle_stop_request_message() { + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read stop request from RTI."); + tag_t tag_to_stop = extract_tag(buffer); - // Indicates whether or not the intended tag - // of the message (timestamp, microstep) is - // in the future relative to the tag of this - // federate. By default, assume it is not. - bool message_tag_is_in_the_future = lf_tag_compare(tag, env->current_tag) > 0; + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_STOP_REQ, _lf_my_fed_id, &tag_to_stop); + LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_REQUEST signal with tag " PRINTF_TAG ".", + tag_to_stop.time - start_time, + tag_to_stop.microstep); - // Assign the intended tag - trigger->intended_tag = tag; + extern lf_mutex_t global_mutex; + extern bool lf_stop_requested; + bool already_blocked = false; - // Calculate the extra_delay required to be passed - // to the schedule function. - interval_t extra_delay = tag.time - env->current_tag.time; - if (!message_tag_is_in_the_future) { -#ifdef FEDERATED_CENTRALIZED - // If the coordination is centralized, receiving a message - // that does not carry a timestamp that is in the future - // would indicate a critical condition, showing that the - // time advance mechanism is not working correctly. - lf_print_error_and_exit("Received a message at tag " PRINTF_TAG " that" - " has a tag " PRINTF_TAG " that has violated the STP offset. 
" - "Centralized coordination should not have these types of messages.", - env->current_tag.time - start_time, env->current_tag.microstep, - tag.time - start_time, tag.microstep); -#else - // Set the delay back to 0 - extra_delay = 0LL; - LF_PRINT_LOG("Calling schedule with 0 delay and intended tag " PRINTF_TAG ".", - trigger->intended_tag.time - start_time, - trigger->intended_tag.microstep); - return_value = _lf_schedule(env, trigger, extra_delay, token); -#endif - } else { - // In case the message is in the future, call - // _lf_schedule_at_tag() so that the microstep is respected. - LF_PRINT_LOG("Received a message that is (" PRINTF_TIME " nanoseconds, " PRINTF_MICROSTEP " microsteps) " - "in the future.", extra_delay, tag.microstep - env->current_tag.microstep); - return_value = _lf_schedule_at_tag(env, trigger, tag, token); + LF_MUTEX_LOCK(global_mutex); + if (lf_stop_requested) { + LF_PRINT_LOG("Ignoring MSG_TYPE_STOP_REQUEST from RTI because lf_request_stop has been called locally."); + already_blocked = true; } - // Notify the main thread in case it is waiting for physical time to elapse. - LF_PRINT_DEBUG("Broadcasting notification that event queue changed."); - lf_cond_broadcast(&env->event_q_changed); - return return_value; -} - -/** - * Request to close the socket that receives incoming messages from the - * specified federate ID. This sends a message to the upstream federate - * requesting that it close the socket. If the message is sent successfully, - * this returns 1. Otherwise it returns 0, which presumably means that the - * socket is already closed. - * - * @param The ID of the peer federate sending messages to this federate. - * - * @return 1 if the MSG_TYPE_CLOSE_REQUEST message is sent successfully, 0 otherwise. - */ -int _lf_request_close_inbound_socket(int fed_id) { - assert(fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); + // Treat the stop request from the RTI as if a local stop request had been received. 
+ lf_stop_requested = true; + LF_MUTEX_UNLOCK(global_mutex); - if (_fed.sockets_for_inbound_p2p_connections[fed_id] < 1) return 0; - - // Send a MSG_TYPE_CLOSE_REQUEST message. - unsigned char message_marker = MSG_TYPE_CLOSE_REQUEST; - LF_PRINT_LOG("Sending MSG_TYPE_CLOSE_REQUEST message to upstream federate."); - - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); - - ssize_t written = write_to_socket( - _fed.sockets_for_inbound_p2p_connections[fed_id], - 1, &message_marker); - _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; - if (written == 1) { - LF_PRINT_LOG("Sent MSG_TYPE_CLOSE_REQUEST message to upstream federate."); - return 1; - } else { - return 0; + // If we have previously received from the RTI a stop request, + // or we have previously sent a stop request to the RTI, + // then we have already blocked tag advance in enclaves. + // Do not do this twice. The record of whether the first has occurred + // is guarded by the outbound socket mutex. + // The second is guarded by the global mutex. + // Note that the RTI should not send stop requests more than once to federates. + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + if (_fed.received_stop_request_from_rti) { + LF_PRINT_LOG("Redundant MSG_TYPE_STOP_REQUEST from RTI. Ignoring it."); + already_blocked = true; + } else if (!already_blocked) { + // Do this only if lf_request_stop has not been called because it will + // prevent lf_request_stop from sending. + _fed.received_stop_request_from_rti = true; } -} + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); -/** - * Close the socket that receives incoming messages from the - * specified federate ID or RTI. This function should be called when a read - * of incoming socket fails or when an EOF is received. - * - * @param The ID of the peer federate sending messages to this - * federate, or -1 if the RTI. 
- */ -void _lf_close_inbound_socket(int fed_id) { - if (fed_id < 0) { - // socket connection is to the RTI. - int socket = _fed.socket_TCP_RTI; - // First, set the global socket to -1. - _fed.socket_TCP_RTI = -1; - // Then shutdown and close the socket. - shutdown(socket, SHUT_RDWR); - close(socket); - } else if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { - shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); - close(_fed.sockets_for_inbound_p2p_connections[fed_id]); - _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + if (already_blocked) { + // Either we have sent a stop request to the RTI ourselves, + // or we have previously received a stop request from the RTI. + // Nothing more to do. Tag advance is already blocked on enclaves. + return; } -} -/** - * Handle a port absent message received from a remote federate. - * This just sets the last known status tag of the port specified - * in the message. - * - * @param socket The socket to read the message from - * @param buffer The buffer to read - * @param fed_id The sending federate ID or -1 if the centralized coordination. - */ -static void handle_port_absent_message(int socket, int fed_id) { - size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(socket, bytes_to_read, buffer, - "Failed to read port absent message."); + // Iterate over the scheduling enclaves to find the maximum current tag + // and adjust the tag_to_stop if any of those is greater than tag_to_stop. + // If not done previously, block tag advance in the enclave. + environment_t *env; + int num_environments = _lf_get_environments(&env); + for (int i = 0; i < num_environments; i++) { + LF_MUTEX_LOCK(env[i].mutex); + if (lf_tag_compare(tag_to_stop, env[i].current_tag) <= 0) { + // Can't stop at the requested tag. Make a counteroffer. 
+ tag_to_stop = env->current_tag; + tag_to_stop.microstep++; + } + // Set a barrier to prevent the enclave from advancing past the so-far tag to stop. + _lf_increment_tag_barrier_locked(&env[i], tag_to_stop); - // Extract the header information. - unsigned short port_id = extract_uint16(buffer); - // The next part of the message is the federate_id, but we don't need it. - // unsigned short federate_id = extract_uint16(&(buffer[sizeof(uint16_t)])); - tag_t intended_tag = extract_tag(&(buffer[sizeof(uint16_t)+sizeof(uint16_t)])); + LF_MUTEX_UNLOCK(env[i].mutex); + } + // Send the reply, which is the least tag at which we can stop. + unsigned char outgoing_buffer[MSG_TYPE_STOP_REQUEST_REPLY_LENGTH]; + ENCODE_STOP_REQUEST_REPLY(outgoing_buffer, tag_to_stop.time, tag_to_stop.microstep); // Trace the event when tracing is enabled - if (fed_id == -1) { - tracepoint_federate_from_rti(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, &intended_tag); - } else { - tracepoint_federate_from_federate(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, fed_id, &intended_tag); - } - LF_PRINT_LOG("Handling port absent for tag " PRINTF_TAG " for port %hu of fed %d.", - intended_tag.time - lf_time_start(), - intended_tag.microstep, - port_id, - fed_id - ); + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); - // Environment is always the one corresponding to the top-level scheduling enclave. - environment_t *env; - _lf_get_environments(&env); + // Send the current logical time to the RTI. 
+ LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &lf_outbound_socket_mutex, + "Failed to send the answer to MSG_TYPE_STOP_REQUEST to RTI."); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); - lf_mutex_lock(&env->mutex); -#ifdef FEDERATED_DECENTRALIZED - trigger_t* network_input_port_action = _lf_action_for_port(port_id)->trigger; - if (lf_tag_compare(intended_tag, - network_input_port_action->last_known_status_tag) < 0) { - lf_mutex_unlock(&env->mutex); - } -#endif // In centralized coordination, a TAG message from the RTI - // can set the last_known_status_tag to a future tag where messages - // have not arrived yet. - // Set the mutex status as absent - update_last_known_status_on_input_port(intended_tag, port_id); - lf_mutex_unlock(&env->mutex); + LF_PRINT_DEBUG("Sent MSG_TYPE_STOP_REQUEST_REPLY to RTI with tag " PRINTF_TAG, + tag_to_stop.time, tag_to_stop.microstep); } /** - * Handle a message being received from a remote federate. - * - * This function assumes the caller does not hold the mutex lock. - * @param socket The socket to read the message from - * @param buffer The buffer to read - * @param fed_id The sending federate ID or -1 if the centralized coordination. + * Send a resign signal to the RTI. */ -void handle_message(int socket, int fed_id) { - // FIXME: Need better error handling? - // Read the header. - size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(socket, bytes_to_read, buffer, - "Failed to read message header."); - - // Extract the header information. 
- unsigned short port_id; - unsigned short federate_id; - size_t length; - extract_header(buffer, &port_id, &federate_id, &length); - // Check if the message is intended for this federate - assert(_lf_my_fed_id == federate_id); - LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); - - // Get the triggering action for the corresponding port - lf_action_base_t* action = _lf_action_for_port(port_id); - - // Read the payload. - // Allocate memory for the message contents. - unsigned char* message_contents = (unsigned char*)malloc(length); - read_from_socket_errexit(socket, length, message_contents, - "Failed to read message body."); - // Trace the event when tracing is enabled - tracepoint_federate_from_federate(_fed.trace, receive_P2P_MSG, _lf_my_fed_id, federate_id, NULL); - LF_PRINT_LOG("Message received by federate: %s. Length: %zu.", message_contents, length); - - LF_PRINT_DEBUG("Calling schedule for message received on a physical connection."); - _lf_schedule_value(action, 0, message_contents, length); +static void send_resign_signal(environment_t* env) { + size_t bytes_to_write = 1; + unsigned char buffer[bytes_to_write]; + buffer[0] = MSG_TYPE_RESIGN; + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0]), &lf_outbound_socket_mutex, + "Failed to send MSG_TYPE_RESIGN."); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + LF_PRINT_LOG("Resigned."); } -void stall_advance_level_federation(environment_t* env, size_t level) { - LF_PRINT_DEBUG("Acquiring the environment mutex."); - lf_mutex_lock(&env->mutex); - LF_PRINT_DEBUG("Waiting on MLAA with next_reaction_level %zu and MLAA %d.", level, max_level_allowed_to_advance); - while (((int) level) >= max_level_allowed_to_advance) { - lf_cond_wait(&port_status_changed); - }; - LF_PRINT_DEBUG("Exiting wait with MLAA %d and next_reaction_level %zu.", max_level_allowed_to_advance, level); - lf_mutex_unlock(&env->mutex); +/** + 
* Send a failed signal to the RTI. + */ +static void send_failed_signal(environment_t* env) { + size_t bytes_to_write = 1; + unsigned char buffer[bytes_to_write]; + buffer[0] = MSG_TYPE_FAILED; + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0]), NULL, + "Failed to send MSG_TYPE_FAILED."); + LF_PRINT_LOG("Failed."); } /** - * Handle a timed message being received from a remote federate via the RTI - * or directly from other federates. - * This will read the tag encoded in the header - * and calculate an offset to pass to the schedule function. - * This function assumes the caller does not hold the mutex lock. - * Instead of holding the mutex lock, this function calls - * _lf_increment_tag_barrier with the tag carried in - * the message header as an argument. This ensures that the current tag - * will not advance to the tag of the message if it is in the future, or - * the tag will not advance at all if the tag of the message is - * now or in the past. - * @param socket The socket to read the message from. - * @param buffer The buffer to read. - * @param fed_id The sending federate ID or -1 if the centralized coordination. + * @brief Stop the traces associated with all environments in the program. */ -void handle_tagged_message(int socket, int fed_id) { - // Environment is always the one corresponding to the top-level scheduling enclave. +static void stop_all_traces() { environment_t *env; - _lf_get_environments(&env); - - // FIXME: Need better error handling? - // Read the header which contains the timestamp. - size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) - + sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(socket, bytes_to_read, buffer, - "Failed to read timed message header"); - - // Extract the header information. 
- unsigned short port_id; - unsigned short federate_id; - size_t length; - tag_t intended_tag; - extract_timed_header(buffer, &port_id, &federate_id, &length, &intended_tag); - // Trace the event when tracing is enabled - if (fed_id == -1) { - tracepoint_federate_from_rti(_fed.trace, receive_TAGGED_MSG, _lf_my_fed_id, &intended_tag); - } else { - tracepoint_federate_from_federate(_fed.trace, receive_P2P_TAGGED_MSG, _lf_my_fed_id, fed_id, &intended_tag); - } - // Check if the message is intended for this federate - assert(_lf_my_fed_id == federate_id); - LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); - - // Get the triggering action for the corresponding port - lf_action_base_t* action = _lf_action_for_port(port_id); - - // Record the physical time of arrival of the message - instant_t time_of_arrival = lf_time_physical(); - - if (action->trigger->is_physical) { - // Messages sent on physical connections should be handled via handle_message(). - lf_print_error_and_exit("Received a timed message on a physical connection."); + int num_envs = _lf_get_environments(&env); + for (int i = 0; i < num_envs; i++) { + stop_trace(env[i].trace); } +} -#ifdef FEDERATED_DECENTRALIZED - // Only applicable for federated programs with decentralized coordination: - // For logical connections in decentralized coordination, - // increment the barrier to prevent advancement of tag beyond - // the received tag if possible. The following function call - // suggests that the tag barrier be raised to the tag provided - // by the message. If this tag is in the past, the function will cause - // the tag to freeze at the current level. - // If something happens, make sure to release the barrier. 
- _lf_increment_tag_barrier(env, intended_tag); -#endif - LF_PRINT_LOG("Received message on port %d with tag: " PRINTF_TAG ", Current tag: " PRINTF_TAG ".", - port_id, intended_tag.time - start_time, intended_tag.microstep, - lf_time_logical_elapsed(env), env->current_tag.microstep); +/** + * Handle a failed signal from the RTI. The RTI will only fail + * if it is forced to exit, e.g. by a SIG_INT. Hence, this federate + * will exit immediately with an error condition, counting on the + * termination functions to handle any cleanup needed. + */ +static void handle_rti_failed_message(void) { + exit(1); +} - // Read the payload. - // Allocate memory for the message contents. - unsigned char* message_contents = (unsigned char*)malloc(length); - read_from_socket_errexit(socket, length, message_contents, - "Failed to read message body."); +/** + * Thread that listens for TCP inputs from the RTI. + * When messages arrive, this calls the appropriate handler. + * @param args Ignored + */ +static void* listen_to_rti_TCP(void* args) { + // Buffer for incoming messages. + // This does not constrain the message size + // because the message will be put into malloc'd memory. + unsigned char buffer[FED_COM_BUFFER_SIZE]; - // The following is only valid for string messages. - // LF_PRINT_DEBUG("Message received: %s.", message_contents); - - lf_mutex_lock(&env->mutex); - - action->trigger->physical_time_of_arrival = time_of_arrival; - - // Create a token for the message - lf_token_t* message_token = _lf_new_token((token_type_t*)action, message_contents, length); - - // FIXME: It might be enough to just check this field and not the status at all - update_last_known_status_on_input_port(intended_tag, port_id); - - // Check whether reactions need to be inserted directly into the reaction - // queue or a call to schedule is needed. 
This checks if the intended - // tag of the message is for the current tag or a tag that is already - // passed and if any port absent reaction is waiting on this port (or the - // execution hasn't even started). - // If the tag is intended for a tag that is passed, the port absent reactions - // would need to exit because only one message can be processed per tag, - // and that message is going to be a tardy message. The actual tardiness - // handling is done inside _lf_insert_reactions_for_trigger. - // To prevent multiple processing of messages per tag, - // we also need to check the port status. - // For example, there could be a case where current tag is - // 10 with a port absent reaction waiting, and a message has arrived with intended_tag 8. - // This message will eventually cause the port absent reaction to exit, but before that, - // a message with intended_tag of 9 could arrive before the port absent reaction has had a chance - // to exit. The port status is on the other hand changed in this thread, and thus, - // can be checked in this scenario without this race condition. The message with - // intended_tag of 9 in this case needs to wait one microstep to be processed. - if (lf_tag_compare(intended_tag, lf_tag(env)) <= 0 && // The event is meant for the current or a previous tag. - (action->trigger->status == unknown || // if the status of the port is still unknown. - _lf_execution_started == false) // Or, execution hasn't even started, so it's safe to handle this event. - ) { - // Since the message is intended for the current tag and a port absent reaction - // was waiting for the message, trigger the corresponding reactions for this - // message. - LF_PRINT_LOG( - "Inserting reactions directly at tag " PRINTF_TAG ". 
" - "Intended tag: " PRINTF_TAG ".", - env->current_tag.time - lf_time_start(), - env->current_tag.microstep, - intended_tag.time - lf_time_start(), - intended_tag.microstep - ); - action->trigger->intended_tag = intended_tag; - _lf_insert_reactions_for_trigger(env, action->trigger, message_token); - - // Set the status of the port as present here to inform the network input - // port absent reactions know that they no longer need to block. The reason for - // that is because the network receiver reaction is now in the reaction queue - // keeping the precedence order intact. - set_network_port_status(port_id, present); - - // Port is now present. Therefore, notify the level advancer to proceed - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); - } else { - // If no port absent reaction is waiting for this message, or if the intended - // tag is in the future, use schedule functions to process the message. - - // Before that, if the current time >= stop time, discard the message. - // But only if the stop time is not equal to the start time! - if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0) { - lf_print_error("Received message too late. Already at stop tag.\n" - "Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" - "Discarding message.", - env->current_tag.time - start_time, env->current_tag.microstep, - intended_tag.time - start_time, intended_tag.microstep); - goto release; + // Listen for messages from the federate. + while (1) { + // Check whether the RTI socket is still valid + if (_fed.socket_TCP_RTI < 0) { + lf_print_warning("Socket to the RTI unexpectedly closed."); + return NULL; } - - LF_PRINT_LOG("Calling schedule with tag " PRINTF_TAG ".", intended_tag.time - start_time, intended_tag.microstep); - schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); + // Read one byte to get the message type. 
+ // This will exit if the read fails. + int read_failed = read_from_socket(_fed.socket_TCP_RTI, 1, buffer); + if (read_failed < 0) { + if (errno == ECONNRESET) { + lf_print_error("Socket connection to the RTI was closed by the RTI without" + " properly sending an EOF first. Considering this a soft error."); + // FIXME: If this happens, possibly a new RTI must be elected. + _fed.socket_TCP_RTI = -1; + return NULL; + } else { + lf_print_error("Socket connection to the RTI has been broken with error %d: %s." + " The RTI should close connections with an EOF first." + " Considering this a soft error.", + errno, + strerror(errno)); + // FIXME: If this happens, possibly a new RTI must be elected. + _fed.socket_TCP_RTI = -1; + return NULL; + } + } else if (read_failed > 0) { + // EOF received. + lf_print("Connection to the RTI closed with an EOF."); + _fed.socket_TCP_RTI = -1; + stop_all_traces(); + return NULL; + } + switch (buffer[0]) { + case MSG_TYPE_TAGGED_MESSAGE: + if (handle_tagged_message(&_fed.socket_TCP_RTI, -1)) { + // Failures to complete the read of messages from the RTI are fatal. + lf_print_error_and_exit("Failed to complete the reading of a message from the RTI."); + } + break; + case MSG_TYPE_TAG_ADVANCE_GRANT: + handle_tag_advance_grant(); + break; + case MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT: + handle_provisional_tag_advance_grant(); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(); + break; + case MSG_TYPE_STOP_GRANTED: + handle_stop_granted_message(); + break; + case MSG_TYPE_PORT_ABSENT: + if (handle_port_absent_message(&_fed.socket_TCP_RTI, -1)) { + // Failures to complete the read of absent messages from the RTI are fatal. 
+ lf_print_error_and_exit("Failed to complete the reading of an absent message from the RTI."); + } + break; + case MSG_TYPE_FAILED: + handle_rti_failed_message(); + break; + case MSG_TYPE_CLOCK_SYNC_T1: + case MSG_TYPE_CLOCK_SYNC_T4: + lf_print_error("Federate %d received unexpected clock sync message from RTI on TCP socket.", + _lf_my_fed_id); + break; + default: + lf_print_error_and_exit("Received from RTI an unrecognized TCP message type: %hhx.", buffer[0]); + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, NULL); + } } - - release: -#ifdef FEDERATED_DECENTRALIZED // Only applicable for federated programs with decentralized coordination - // Finally, decrement the barrier to allow the execution to continue - // past the raised barrier - _lf_decrement_tag_barrier_locked(env); -#endif - - // The mutex is unlocked here after the barrier on - // logical time has been removed to avoid - // the need for unecessary lock and unlock - // operations. - lf_mutex_unlock(&env->mutex); + return NULL; } /** - * Handle a time advance grant (TAG) message from the RTI. - * This updates the last known status tag for each network input - * port, and broadcasts a signal, which may cause a blocking - * port absent reaction to unblock. - * - * In addition, this updates the last known TAG/PTAG and broadcasts - * a notification of this update, which may unblock whichever worker - * thread is trying to advance time. - * - * @note This function is very similar to handle_provisinal_tag_advance_grant() except that - * it sets last_TAG_was_provisional to false. + * Modify the specified tag, if necessary, to be an earlier tag based + * on the current physical time. The earlier tag is necessary if this federate + * has downstream federates and also has physical actions that may trigger + * outputs. 
In that case, the earlier tag will be the current physical time + * plus the minimum delay on all such physical actions plus any other delays + * along the path from the triggering physical action to the output port + * minus one nanosecond. The modified tag is assured of being less than any + * output tag that might later be produced. + * @param tag A pointer to the proposed NET. + * @return True if this federate requires this modification and the tag was + * modified. */ -void handle_tag_advance_grant(void) { - // Environment is always the one corresponding to the top-level scheduling enclave. - environment_t *env; - _lf_get_environments(&env); - - size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, - "Failed to read tag advance grant from RTI."); - tag_t TAG = extract_tag(buffer); - - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_TAG, _lf_my_fed_id, &TAG); - - lf_mutex_lock(&env->mutex); - - // Update the last known status tag of all network input ports - // to the TAG received from the RTI. Here we assume that the RTI - // knows the status of network ports up to and including the granted tag, - // so by extension, we assume that the federate can safely rely - // on the RTI to handle port statuses up until the granted tag. - update_last_known_status_on_input_ports(TAG); - - // It is possible for this federate to have received a PTAG - // earlier with the same tag as this TAG. 
- if (lf_tag_compare(TAG, _fed.last_TAG) >= 0) { - _fed.last_TAG.time = TAG.time; - _fed.last_TAG.microstep = TAG.microstep; - _fed.is_last_TAG_provisional = false; - LF_PRINT_LOG("Received Time Advance Grant (TAG): " PRINTF_TAG ".", - _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); - } else { - lf_mutex_unlock(&env->mutex); - lf_print_error("Received a TAG " PRINTF_TAG " that wasn't larger " - "than the previous TAG or PTAG " PRINTF_TAG ". Ignoring the TAG.", - TAG.time - start_time, TAG.microstep, - _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); +static bool bounded_NET(tag_t* tag) { + // The tag sent by this function is a promise that, absent + // inputs from another federate, this federate will not produce events + // earlier than t. But if there are downstream federates and there is + // a physical action (not counting receivers from upstream federates), + // then we can only promise up to current physical time (plus the minimum + // of all minimum delays on the physical actions). + // In this case, we send a NET message with the current physical time + // to permit downstream federates to advance. To avoid + // overwhelming the network, this NET message should be sent periodically + // at specified intervals controlled by the target parameter + // coordination-options: {advance-message-interval: time units}. + // The larger the interval, the more downstream federates will lag + // behind real time, but the less network traffic. If this option is + // missing, we issue a warning message suggesting that a redesign + // might be in order so that outputs don't depend on physical actions. + LF_PRINT_DEBUG("Checking NET to see whether it should be bounded by physical time." 
+ " Min delay from physical action: " PRINTF_TIME ".", + _fed.min_delay_from_physical_action_to_federate_output); + if (_fed.min_delay_from_physical_action_to_federate_output >= 0LL + && _fed.has_downstream + ) { + // There is a physical action upstream of some output from this + // federate, and there is at least one downstream federate. + // Compare the tag to the current physical time. + instant_t physical_time = lf_time_physical(); + if (physical_time + _fed.min_delay_from_physical_action_to_federate_output < tag->time) { + // Can only promise up and not including this new time: + tag->time = physical_time + _fed.min_delay_from_physical_action_to_federate_output - 1L; + tag->microstep = 0; + LF_PRINT_LOG("Has physical actions that bound NET to " PRINTF_TAG ".", + tag->time - start_time, tag->microstep); + return true; + } } - // Notify everything that is blocked. - lf_cond_broadcast(&env->event_q_changed); - - lf_mutex_unlock(&env->mutex); + return false; } +////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// +// Public functions (declared in reactor.h) +// An empty version of this function is code generated for unfederated execution. + /** - * Send a logical tag complete (LTC) message to the RTI - * unless an equal or later LTC has previously been sent. - * This function assumes the caller holds the mutex lock. - * - * @param tag_to_send The tag to send. + * Close sockets used to communicate with other federates, if they are open, + * and send a MSG_TYPE_RESIGN message to the RTI. This implements the function + * defined in reactor.h. For unfederated execution, the code generator + * generates an empty implementation. 
+ * @param env The environment of the federate */ -void _lf_logical_tag_complete(tag_t tag_to_send) { - int compare_with_last_tag = lf_tag_compare(_fed.last_sent_LTC, tag_to_send); - if (compare_with_last_tag >= 0) { - return; +void terminate_execution(environment_t* env) { + assert(env != GLOBAL_ENVIRONMENT); + + // For an abnormal termination (e.g. a SIGINT), we need to send a + // MSG_TYPE_FAILED message to the RTI, but we should not acquire a mutex. + if (_fed.socket_TCP_RTI >= 0) { + if (_lf_normal_termination) { + tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); + send_resign_signal(env); + } else { + tracepoint_federate_to_rti(_fed.trace, send_FAILED, _lf_my_fed_id, &env->current_tag); + send_failed_signal(env); + } } - LF_PRINT_LOG("Sending Logical Time Complete (LTC) " PRINTF_TAG " to the RTI.", - tag_to_send.time - start_time, - tag_to_send.microstep); - _lf_send_tag(MSG_TYPE_LOGICAL_TAG_COMPLETE, tag_to_send, true); - _fed.last_sent_LTC = tag_to_send; -} -bool update_max_level(tag_t tag, bool is_provisional) { - // This always needs the top-level environment, which will be env[0]. - environment_t *env; - _lf_get_environments(&env); - int prev_max_level_allowed_to_advance = max_level_allowed_to_advance; - max_level_allowed_to_advance = INT_MAX; - if ((lf_tag_compare(env->current_tag, tag) < 0) || ( - lf_tag_compare(env->current_tag, tag) == 0 && !is_provisional - )) { - LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", - max_level_allowed_to_advance, - lf_time_logical_elapsed(env) - ); - // Safe to complete the current tag - return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); + LF_PRINT_DEBUG("Closing incoming P2P sockets."); + // Close any incoming P2P sockets that are still open. + for (int i=0; i < NUMBER_OF_FEDERATES; i++) { + close_inbound_socket(i, 1); + // Ignore errors. Mark the socket closed. 
+ _fed.sockets_for_inbound_p2p_connections[i] = -1; } -#ifdef FEDERATED_DECENTRALIZED - size_t action_table_size = _lf_action_table_size; - lf_action_base_t** action_table = _lf_action_table; -#else - size_t action_table_size = _lf_zero_delay_action_table_size; - lf_action_base_t** action_table = _lf_zero_delay_action_table; -#endif // FEDERATED_DECENTRALIZED - for (int i = 0; i < action_table_size; i++) { - lf_action_base_t* input_port_action = action_table[i]; -#ifdef FEDERATED_DECENTRALIZED - // In decentralized execution, if the current_tag is close enough to the - // start tag and there is a large enough delay on an incoming - // connection, then there is no need to block progress waiting for this - // port status. - if ( - (_lf_action_delay_table[i] == 0 && env->current_tag.time == start_time && env->current_tag.microstep == 0) - || (_lf_action_delay_table[i] > 0 && lf_tag_compare( - env->current_tag, - lf_delay_strict((tag_t) {.time=start_time, .microstep=0}, _lf_action_delay_table[i]) - ) <= 0) - ) { - continue; - } -#endif // FEDERATED_DECENTRALIZED - if (lf_tag_compare(env->current_tag, - input_port_action->trigger->last_known_status_tag) > 0 - && !input_port_action->trigger->is_physical) { - max_level_allowed_to_advance = LF_MIN( - max_level_allowed_to_advance, - ((int) LF_LEVEL(input_port_action->trigger->reactions[0]->index)) - ); - } + + // Check for all outgoing physical connections in + // _fed.sockets_for_outbound_p2p_connections and + // if the socket ID is not -1, the connection is still open. + // Send an EOF by closing the socket here. + for (int i=0; i < NUMBER_OF_FEDERATES; i++) { + + // Close outbound connections, in case they have not closed themselves. + // This will result in EOF being sent to the remote federate, except for + // abnormal termination, in which case it will just close the socket. + int flag = _lf_normal_termination? 
1 : -1; + close_outbound_socket(i, flag); } - LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", - max_level_allowed_to_advance, - lf_time_logical_elapsed(env) - ); - return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); -} -#ifdef FEDERATED_DECENTRALIZED -/** - * @brief Return whether there exists an input port whose status is unknown. - * - * @param staa_elem A record of all input port actions. - */ -static bool a_port_is_unknown(staa_t* staa_elem) { - bool do_wait = false; - for (int j = 0; j < staa_elem->numActions; ++j) { - if (staa_elem->actions[j]->trigger->status == unknown) { - do_wait = true; - break; + LF_PRINT_DEBUG("Waiting for inbound p2p socket listener threads."); + // Wait for each inbound socket listener thread to close. + if (_fed.number_of_inbound_p2p_connections > 0 && _fed.inbound_socket_listeners != NULL) { + LF_PRINT_LOG("Waiting for %zu threads listening for incoming messages to exit.", + _fed.number_of_inbound_p2p_connections); + for (int i=0; i < _fed.number_of_inbound_p2p_connections; i++) { + // Ignoring errors here. + lf_thread_join(_fed.inbound_socket_listeners[i], NULL); } } - return do_wait; -} -#endif -/** - * @brief Return the port ID of the port associated with the given action. - */ -static int id_of_action(lf_action_base_t* input_port_action) { - for (int i = 0; 1; i++) { - if (_lf_action_for_port(i) == input_port_action) return i; + LF_PRINT_DEBUG("Waiting for RTI's socket listener threads."); + // Wait for the thread listening for messages from the RTI to close. + lf_thread_join(_fed.RTI_socket_listener, NULL); + + // For abnormal termination, there is no need to free memory. + if (_lf_normal_termination) { + LF_PRINT_DEBUG("Freeing memory occupied by the federate."); + free(_fed.inbound_socket_listeners); + free(federation_metadata.rti_host); + free(federation_metadata.rti_user); } - // There will be no UB buffer overrun because _lf_action_for_port(i) has a check. 
} -/** - * @brief Given a list of staa offsets and its associated triggers, - * have a single thread work to set ports to absent at a given logical time - * - */ -#ifdef FEDERATED_DECENTRALIZED -static void* update_ports_from_staa_offsets(void* args) { - environment_t *env; - int num_envs = _lf_get_environments(&env); - while (1) { - bool restart = false; - tag_t tag_when_started_waiting = lf_tag(env); - for (int i = 0; i < staa_lst_size; ++i) { - staa_t* staa_elem = staa_lst[i]; - interval_t wait_until_time = env->current_tag.time + staa_elem->STAA + _lf_fed_STA_offset - _lf_action_delay_table[i]; - lf_mutex_lock(&env->mutex); - // Both before and after the wait, check that the tag has not changed - if (a_port_is_unknown(staa_elem) && lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0 && wait_until(env, wait_until_time, &port_status_changed) && lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { - for (int j = 0; j < staa_elem->numActions; ++j) { - lf_action_base_t* input_port_action = staa_elem->actions[j]; - if (input_port_action->trigger->status == unknown) { - input_port_action->trigger->status = absent; - LF_PRINT_DEBUG("Assuming port absent at time %lld.", (long long) (lf_tag(env).time - start_time)); - update_last_known_status_on_input_port(lf_tag(env), id_of_action(input_port_action)); - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); - } - } - lf_mutex_unlock(&env->mutex); - } else if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { - // We have committed to a new tag before we finish processing the list. Start over. 
- restart = true; - lf_mutex_unlock(&env->mutex); - break; + +////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// +// Public functions (declared in federate.h, in alphabetical order) + +void lf_connect_to_federate(uint16_t remote_federate_id) { + int result = -1; + int count_retries = 0; + + // Ask the RTI for port number of the remote federate. + // The buffer is used for both sending and receiving replies. + // The size is what is needed for receiving replies. + unsigned char buffer[sizeof(int32_t) + INET_ADDRSTRLEN + 1]; + int port = -1; + struct in_addr host_ip_addr; + int count_tries = 0; + while (port == -1 && !_lf_termination_executed) { + buffer[0] = MSG_TYPE_ADDRESS_QUERY; + // NOTE: Sending messages in little endian. + encode_uint16(remote_federate_id, &(buffer[1])); + + LF_PRINT_DEBUG("Sending address query for federate %d.", remote_federate_id); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_ADR_QR, _lf_my_fed_id, NULL); + + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, sizeof(uint16_t) + 1, buffer, &lf_outbound_socket_mutex, + "Failed to send address query for federate %d to RTI.", + remote_federate_id); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + + // Read RTI's response. + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, buffer, NULL, + "Failed to read the requested port number for federate %d from RTI.", + remote_federate_id); + + if (buffer[0] != MSG_TYPE_ADDRESS_QUERY) { + // Unexpected reply. Could be that RTI has failed and sent a resignation. 
+ if (buffer[0] == MSG_TYPE_FAILED) { + lf_print_error_and_exit("RTI has failed."); } else { - lf_mutex_unlock(&env->mutex); + lf_print_error_and_exit("Unexpected reply of type %hhu from RTI (see net_common.h).", buffer[0]); } } - if (restart) continue; + port = extract_int32(&buffer[1]); - lf_mutex_lock(&env->mutex); - while (lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { - lf_cond_wait(&logical_time_changed); + read_from_socket_fail_on_error( + &_fed.socket_TCP_RTI, sizeof(host_ip_addr), (unsigned char*)&host_ip_addr, NULL, + "Failed to read the IP address for federate %d from RTI.", + remote_federate_id); + + // A reply of -1 for the port means that the RTI does not know + // the port number of the remote federate, presumably because the + // remote federate has not yet sent an MSG_TYPE_ADDRESS_ADVERTISEMENT message to the RTI. + // Sleep for some time before retrying. + if (port == -1) { + if (count_tries++ >= CONNECT_MAX_RETRIES) { + lf_print_error_and_exit("TIMEOUT obtaining IP/port for federate %d from the RTI.", + remote_federate_id); + } + // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. + lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); } - lf_mutex_unlock(&env->mutex); } -} + assert(port < 65536); + assert(port > 0); + uint16_t uport = (uint16_t)port; -/** - * @brief Spawns a thread to iterate through STAA structs, setting its associated ports absent - * at an offset if the port is not present with a value by a certain physical time. - * - */ -void spawn_staa_thread(){ - lf_thread_create(&_fed.staaSetter, update_ports_from_staa_offsets, NULL); -} +#if LOG_LEVEL > 3 + // Print the received IP address in a human readable format + // Create the human readable format of the received address. + // This is avoided unless LOG_LEVEL is high enough to + // subdue the overhead caused by inet_ntop(). 
+ char hostname[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &host_ip_addr, hostname, INET_ADDRSTRLEN); + LF_PRINT_LOG("Received address %s port %d for federate %d from RTI.", + hostname, uport, remote_federate_id); #endif -/** - * Handle a provisional tag advance grant (PTAG) message from the RTI. - * This updates the last known TAG/PTAG and broadcasts - * a notification of this update, which may unblock whichever worker - * thread is trying to advance time. - * If current_time is less than the specified PTAG, then this will - * also insert into the event_q a dummy event with the specified tag. - * This will ensure that the federate advances time to the specified - * tag and, for centralized coordination, stimulates null-message-sending - * output reactions at that tag. - * - * @note This function is similar to handle_tag_advance_grant() except that - * it sets last_TAG_was_provisional to true and also it does not update the - * last known tag for input ports. - */ -void handle_provisional_tag_advance_grant() { - // Environment is always the one corresponding to the top-level scheduling enclave. - environment_t *env; - _lf_get_environments(&env); + // Iterate until we either successfully connect or exceed the number of + // attempts given by CONNECT_MAX_RETRIES. + int socket_id = -1; + while (result < 0 && !_lf_termination_executed) { + // Create an IPv4 socket for TCP (not UDP) communication over IP (0). + socket_id = create_real_time_tcp_socket_errexit(); - size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, - "Failed to read provisional tag advance grant from RTI."); - tag_t PTAG = extract_tag(buffer); + // Server file descriptor. + struct sockaddr_in server_fd; + // Zero out the server_fd struct. 
+ bzero((char*)&server_fd, sizeof(server_fd)); - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_PTAG, _lf_my_fed_id, &PTAG); + // Set up the server_fd fields. + server_fd.sin_family = AF_INET; // IPv4 + server_fd.sin_addr = host_ip_addr; // Received from the RTI - // Note: it is important that last_known_status_tag of ports does not - // get updated to a PTAG value because a PTAG does not indicate that - // the RTI knows about the status of all ports up to and _including_ - // the value of PTAG. Only a TAG message indicates that. - lf_mutex_lock(&env->mutex); + // Convert the port number from host byte order to network byte order. + server_fd.sin_port = htons(uport); + result = connect( + socket_id, + (struct sockaddr *)&server_fd, + sizeof(server_fd)); - // Sanity check - if (lf_tag_compare(PTAG, _fed.last_TAG) < 0 - || (lf_tag_compare(PTAG, _fed.last_TAG) == 0 && !_fed.is_last_TAG_provisional)) { - lf_mutex_unlock(&env->mutex); - lf_print_error_and_exit("Received a PTAG " PRINTF_TAG " that is equal or earlier " - "than an already received TAG " PRINTF_TAG ".", - PTAG.time, PTAG.microstep, - _fed.last_TAG.time, _fed.last_TAG.microstep); - } + if (result != 0) { + lf_print_error("Failed to connect to federate %d on port %d.", remote_federate_id, uport); - _fed.last_TAG = PTAG; - _fed.is_last_TAG_provisional = true; - LF_PRINT_LOG("At tag " PRINTF_TAG ", received Provisional Tag Advance Grant (PTAG): " PRINTF_TAG ".", - env->current_tag.time - start_time, env->current_tag.microstep, - _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + // Try again after some time if the connection failed. + // Note that this should not really happen since the remote federate should be + // accepting socket connections. But possibly it will be busy (in process of accepting + // another socket connection?). Hence, we retry. 
+ count_retries++; + if (count_retries > CONNECT_MAX_RETRIES) { + // If the remote federate is not accepting the connection after CONNECT_MAX_RETRIES + // treat it as a soft error condition and return. + lf_print_error("Failed to connect to federate %d after %d retries. Giving up.", + remote_federate_id, CONNECT_MAX_RETRIES); + return; + } + lf_print_warning("Could not connect to federate %d. Will try again every %lld nanoseconds.\n", + remote_federate_id, ADDRESS_QUERY_RETRY_INTERVAL); + + // Check whether the RTI is still there. + if (rti_failed()) break; - // Even if we don't modify the event queue, we need to broadcast a change - // because we do not need to continue to wait for a TAG. - lf_cond_broadcast(&env->event_q_changed); - // Notify level advance thread which is blocked. - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); + // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. + lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); + } else { + // Connect was successful. + size_t buffer_length = 1 + sizeof(uint16_t) + 1; + unsigned char buffer[buffer_length]; + buffer[0] = MSG_TYPE_P2P_SENDING_FED_ID; + if (_lf_my_fed_id > UINT16_MAX) { + // This error is very unlikely to occur. + lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); + } + encode_uint16((uint16_t)_lf_my_fed_id, (unsigned char*)&(buffer[1])); + unsigned char federation_id_length = (unsigned char)strnlen(federation_metadata.federation_id, 255); + buffer[sizeof(uint16_t) + 1] = federation_id_length; + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_FED_ID, _lf_my_fed_id, remote_federate_id, NULL); + + // No need for a mutex because we have the only handle on the socket. 
+ write_to_socket_fail_on_error(&socket_id, + buffer_length, buffer, NULL, + "Failed to send fed_id to federate %d.", remote_federate_id); + write_to_socket_fail_on_error(&socket_id, + federation_id_length, (unsigned char*)federation_metadata.federation_id, NULL, + "Failed to send federation id to federate %d.", + remote_federate_id); - // Possibly insert a dummy event into the event queue if current time is behind - // (which it should be). Do not do this if the federate has not fully - // started yet. + read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, + "Failed to read MSG_TYPE_ACK from federate %d in response to sending fed_id.", + remote_federate_id); + if (buffer[0] != MSG_TYPE_ACK) { + // Get the error code. + read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, + "Failed to read error code from federate %d in response to sending fed_id.", remote_federate_id); + lf_print_error("Received MSG_TYPE_REJECT message from remote federate (%d).", buffer[0]); + result = -1; + continue; + } else { + lf_print("Connected to federate %d, port %d.", remote_federate_id, port); + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, receive_ACK, _lf_my_fed_id, remote_federate_id, NULL); + } + } + } + // Once we set this variable, then all future calls to close() on this + // socket ID should reset it to -1 within a critical section. + _fed.sockets_for_outbound_p2p_connections[remote_federate_id] = socket_id; +} - instant_t dummy_event_time = PTAG.time; - microstep_t dummy_event_relative_microstep = PTAG.microstep; +void lf_connect_to_rti(const char* hostname, int port) { + LF_PRINT_LOG("Connecting to the RTI."); - if (lf_tag_compare(env->current_tag, PTAG) == 0) { - // The current tag can equal the PTAG if we are at the start time - // or if this federate has been able to advance time to the current - // tag (e.g., it has no upstream federates). 
In either case, either - // it is already treating the current tag as PTAG cycle (e.g. at the - // start time) or it will be completing the current cycle and sending - // a LTC message shortly. In either case, there is nothing more to do. - lf_mutex_unlock(&env->mutex); - return; - } else if (lf_tag_compare(env->current_tag, PTAG) > 0) { - // Current tag is greater than the PTAG. - // It could be that we have sent an LTC that crossed with the incoming - // PTAG or that we have advanced to a tag greater than the PTAG. - // In the former case, there is nothing more to do. - // In the latter case, we may be blocked processing a PTAG cycle at - // a greater tag or we may be in the middle of processing a regular - // TAG. In either case, we know that at the PTAG tag, all outputs - // have either been sent or are absent, so we can send an LTC. - // Send an LTC to indicate absent outputs. - _lf_logical_tag_complete(PTAG); - // Nothing more to do. - lf_mutex_unlock(&env->mutex); - return; - } else if (PTAG.time == env->current_tag.time) { - // We now know env->current_tag < PTAG, but the times are equal. - // Adjust the microstep for scheduling the dummy event. - dummy_event_relative_microstep -= env->current_tag.microstep; - } - // We now know env->current_tag < PTAG. + // Override passed hostname and port if passed as runtime arguments. + hostname = federation_metadata.rti_host ? federation_metadata.rti_host : hostname; + port = federation_metadata.rti_port >= 0 ? federation_metadata.rti_port : port; - if (dummy_event_time != FOREVER) { - // Schedule a dummy event at the specified time and (relative) microstep. - LF_PRINT_DEBUG("At tag " PRINTF_TAG ", inserting into the event queue a dummy event " - "with time " PRINTF_TIME " and (relative) microstep " PRINTF_MICROSTEP ".", - env->current_tag.time - start_time, env->current_tag.microstep, - dummy_event_time - start_time, dummy_event_relative_microstep); - // Dummy event points to a NULL trigger and NULL real event. 
- event_t* dummy = _lf_create_dummy_events(env, - NULL, dummy_event_time, NULL, dummy_event_relative_microstep); - pqueue_insert(env->event_q, dummy); + // Adjust the port. + uint16_t uport = 0; + if (port < 0 || port > INT16_MAX) { + lf_print_error( + "lf_connect_to_rti(): Specified port (%d) is out of range," + " using the default port %d instead.", + port, DEFAULT_PORT + ); + uport = DEFAULT_PORT; + port = 0; // Mark so that increments occur between tries. + } else { + uport = (uint16_t)port; + } + if (uport == 0) { + uport = DEFAULT_PORT; } - lf_mutex_unlock(&env->mutex); -} + // Create a socket + _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); + + int result = -1; + int count_retries = 0; + struct addrinfo* res = NULL; + + while (count_retries++ < CONNECT_MAX_RETRIES && !_lf_termination_executed) { + if (res != NULL) { + // This is a repeated attempt. + if (_fed.socket_TCP_RTI >= 0) close_rti_socket(); + + lf_sleep(CONNECT_RETRY_INTERVAL); + + // Create a new socket. + _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); + + if (port == 0) { + // Free previously allocated address info. + freeaddrinfo(res); + // Increment the port number. + uport++; + if (uport >= DEFAULT_PORT + MAX_NUM_PORT_ADDRESSES) uport = DEFAULT_PORT; + + // Reconstruct the address info. + rti_address(hostname, uport, &res); + } + lf_print("Trying RTI again on port %d (attempt %d).", uport, count_retries); + } else { + // This is the first attempt. + rti_address(hostname, uport, &res); + } -/** - * Send a MSG_TYPE_STOP_REQUEST message to the RTI with payload equal - * to the specified tag plus one microstep. If this federate has previously - * received a stop request from the RTI, then do not send the message and - * return 1. Return -1 if the socket is disconnected. Otherwise, return 0. - * @return 0 if the message is sent. 
- */ -int _lf_fd_send_stop_request_to_rti(tag_t stop_tag) { + result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); + if (result < 0) continue; // Connect failed. - // Send a stop request with the specified tag to the RTI - unsigned char buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - // Stop at the next microstep - ENCODE_STOP_REQUEST(buffer, stop_tag.time, stop_tag.microstep + 1); + // Have connected to an RTI, but not sure it's the right RTI. + // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Notify the RTI of the ID of this federate and its federation. - lf_mutex_lock(&outbound_socket_mutex); - // Do not send a stop request if a stop request has been previously received from the RTI. - if (!_fed.received_stop_request_from_rti) { - LF_PRINT_LOG("Sending to RTI a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - stop_tag.time - start_time, - stop_tag.microstep); +#ifdef FEDERATED_AUTHENTICATED + LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); + if (perform_hmac_authentication()) { + if (port == 0) { + continue; // Try again with a new port. + } else { + // No point in trying again because it will be the same port. + close_rti_socket(); + lf_print_error_and_exit("Authentication failed."); + } + } +#else + LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); +#endif - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return -1; + // Send the message type first. + unsigned char buffer[4]; + buffer[0] = MSG_TYPE_FED_IDS; + // Next send the federate ID. + if (_lf_my_fed_id > UINT16_MAX) { + lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); } + encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); + // Next send the federation ID length. + // The federation ID is limited to 255 bytes. 
+ size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); + buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); + // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); - write_to_socket_with_mutex(_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, - buffer, &outbound_socket_mutex, - "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); - lf_mutex_unlock(&outbound_socket_mutex); - return 0; - } else { - lf_mutex_unlock(&outbound_socket_mutex); - return 1; - } -} + tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); -/** - * Handle a MSG_TYPE_STOP_GRANTED message from the RTI. - * - * This function removes the global barrier on - * logical time raised when lf_request_stop() was - * called in the environment for each enclave. - */ -void handle_stop_granted_message() { + // No need for a mutex here because no other threads are writing to this socket. + if (write_to_socket(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer)) { + continue; // Try again, possibly on a new port. + } - size_t bytes_to_read = MSG_TYPE_STOP_GRANTED_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, - "Failed to read stop granted from RTI."); + // Next send the federation ID itself. + if (write_to_socket( + _fed.socket_TCP_RTI, + federation_id_length, + (unsigned char*)federation_metadata.federation_id)) { + continue; // Try again. + } - tag_t received_stop_tag = extract_tag(buffer); + // Wait for a response. + // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. + // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter + // is used if clock synchronization will be performed. 
+ unsigned char response; - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_STOP_GRN, _lf_my_fed_id, &received_stop_tag); + LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); - LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_GRANTED message with elapsed tag " PRINTF_TAG ".", - received_stop_tag.time - start_time, received_stop_tag.microstep); + if (read_from_socket(_fed.socket_TCP_RTI, 1, &response)) { + continue; // Try again. + } + if (response == MSG_TYPE_REJECT) { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); + // Read one more byte to determine the cause of rejection. + unsigned char cause; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &cause, NULL, + "Failed to read the cause of rejection by the RTI."); + if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { + lf_print_warning("Connected to the wrong RTI on port %d. Will try again", uport); + continue; + } + } else if (response == MSG_TYPE_ACK) { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); + LF_PRINT_LOG("Received acknowledgment from the RTI."); + break; + } else if (response == MSG_TYPE_RESIGN) { + lf_print_warning("RTI on port %d resigned. Will try again", uport); + continue; + } else { + lf_print_warning("RTI on port %d gave unexpect response %u. 
Will try again", uport, response); + continue; + } + } + if (result < 0) { + lf_print_error_and_exit("Failed to connect to RTI after %d tries.", CONNECT_MAX_RETRIES); + } - environment_t *env; - int num_environments = _lf_get_environments(&env); + freeaddrinfo(res); /* No longer needed */ - for (int i = 0; i < num_environments; i++) { - lf_mutex_lock(&env[i].mutex); + // Call a generated (external) function that sends information + // about connections between this federate and other federates + // where messages are routed through the RTI. + // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h + lf_send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); - // Sanity check. - if (lf_tag_compare(received_stop_tag, env[i].current_tag) <= 0) { - lf_print_error("RTI granted a MSG_TYPE_STOP_GRANTED tag that is equal to or less than this federate's current tag " PRINTF_TAG ". " - "Stopping at the next microstep instead.", - env[i].current_tag.time - start_time, env[i].current_tag.microstep); - received_stop_tag = env[i].current_tag; - received_stop_tag.microstep++; - } + uint16_t udp_port = setup_clock_synchronization_with_rti(); - _lf_set_stop_tag(&env[i], received_stop_tag); - LF_PRINT_DEBUG("Setting the stop tag to " PRINTF_TAG ".", - env[i].stop_tag.time - start_time, - env[i].stop_tag.microstep); + // Write the returned port number to the RTI + unsigned char UDP_port_number[1 + sizeof(uint16_t)]; + UDP_port_number[0] = MSG_TYPE_UDP_PORT; + encode_uint16(udp_port, &(UDP_port_number[1])); + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, NULL, + "Failed to send the UDP port number to the RTI."); - if (env[i].barrier.requestors) _lf_decrement_tag_barrier_locked(&env[i]); - // We signal instead of broadcast under the assumption that only - // one worker thread can call wait_until at a given time because - // the call to wait_until is protected by a mutex lock - lf_cond_signal(&env[i].event_q_changed); - 
lf_mutex_unlock(&env[i].mutex); - } + lf_print("Connected to RTI at %s:%d.", hostname, uport); } -/** - * Handle a MSG_TYPE_STOP_REQUEST message from the RTI. - */ -void handle_stop_request_message() { - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, - "Failed to read stop request from RTI."); - tag_t tag_to_stop = extract_tag(buffer); +void lf_create_server(int specified_port) { + assert(specified_port <= UINT16_MAX && specified_port >= 0); + uint16_t port = (uint16_t)specified_port; + LF_PRINT_LOG("Creating a socket server on port %d.", port); + // Create an IPv4 socket for TCP (not UDP) communication over IP (0). + int socket_descriptor = create_real_time_tcp_socket_errexit(); - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_STOP_REQ, _lf_my_fed_id, &tag_to_stop); - LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - tag_to_stop.time - start_time, - tag_to_stop.microstep); + // Server file descriptor. + struct sockaddr_in server_fd; + // Zero out the server address structure. + bzero((char*)&server_fd, sizeof(server_fd)); - // If we have previously received from the RTI a stop request, - // or we have previously sent a stop request to the RTI, - // then we have already blocked tag advance in enclaves. - // Do not do this twice. The record of whether the first has occurred - // is guarded by the outbound socket mutex. - // The second is guarded by the global mutex. - // Note that the RTI should not send stop requests more than once to federates. 
- lf_mutex_lock(&outbound_socket_mutex); - bool already_blocked = false; - if (_fed.received_stop_request_from_rti) { - already_blocked = true; - } - _fed.received_stop_request_from_rti = true; - lf_mutex_unlock(&outbound_socket_mutex); + server_fd.sin_family = AF_INET; // IPv4 + server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. + // Convert the port number from host byte order to network byte order. + server_fd.sin_port = htons(port); - extern lf_mutex_t global_mutex; - extern bool lf_stop_requested; - lf_mutex_lock(&global_mutex); - if (lf_stop_requested) { - already_blocked = true; + int result = bind( + socket_descriptor, + (struct sockaddr *) &server_fd, + sizeof(server_fd)); + int count = 0; + while (result < 0 && count++ < PORT_BIND_RETRY_LIMIT) { + lf_sleep(PORT_BIND_RETRY_INTERVAL); + result = bind( + socket_descriptor, + (struct sockaddr *) &server_fd, + sizeof(server_fd)); + } + if (result < 0) { + lf_print_error_and_exit("Failed to bind socket on port %d.", port); } - lf_mutex_unlock(&global_mutex); - - // Iterate over the scheduling enclaves to find the maximum current tag - // and adjust the tag_to_stop if any of those is greater than tag_to_stop. - // If not done previously, block tag advance in the enclave. - environment_t *env; - int num_environments = _lf_get_environments(&env); - for (int i = 0; i < num_environments; i++) { - lf_mutex_lock(&env[i].mutex); - if (lf_tag_compare(tag_to_stop, env[i].current_tag) <= 0) { - // Can't stop at the requested tag. Make a counteroffer. - tag_to_stop = env->current_tag; - tag_to_stop.microstep++; - } - if (!already_blocked) { - // Set a barrier to prevent the enclave from advancing past the so-far tag to stop. - _lf_increment_tag_barrier_locked(&env[i], tag_to_stop); + // Set the global server port. + if (specified_port == 0) { + // Need to retrieve the port number assigned by the OS. 
+ struct sockaddr_in assigned; + socklen_t addr_len = sizeof(assigned); + if (getsockname(socket_descriptor, (struct sockaddr *) &assigned, &addr_len) < 0) { + lf_print_error_and_exit("Failed to retrieve assigned port number."); } - lf_mutex_unlock(&env[i].mutex); + _fed.server_port = ntohs(assigned.sin_port); + } else { + _fed.server_port = port; } - // Send the reply, which is the least tag at which we can stop. - unsigned char outgoing_buffer[MSG_TYPE_STOP_REQUEST_REPLY_LENGTH]; - ENCODE_STOP_REQUEST_REPLY(outgoing_buffer, tag_to_stop.time, tag_to_stop.microstep); - lf_mutex_lock(&outbound_socket_mutex); - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return; - } + // Enable listening for socket connections. + // The second argument is the maximum number of queued socket requests, + // which according to the Mac man page is limited to 128. + listen(socket_descriptor, 128); + + LF_PRINT_LOG("Server for communicating with other federates started using port %d.", _fed.server_port); + + // Send the server port number to the RTI + // on an MSG_TYPE_ADDRESS_ADVERTISEMENT message (@see net_common.h). + unsigned char buffer[sizeof(int32_t) + 1]; + buffer[0] = MSG_TYPE_ADDRESS_ADVERTISEMENT; + encode_int32(_fed.server_port, &(buffer[1])); + // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); - // Send the current logical time to the RTI. This message does not have an identifying byte - // since the RTI is waiting for a response from this federate. 
- write_to_socket_with_mutex( - _fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &outbound_socket_mutex, - "Failed to send the answer to MSG_TYPE_STOP_REQUEST to RTI."); - lf_mutex_unlock(&outbound_socket_mutex); + tracepoint_federate_to_rti(_fed.trace, send_ADR_AD, _lf_my_fed_id, NULL); + + // No need for a mutex because we have the only handle on this socket. + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, (unsigned char*)buffer, NULL, + "Failed to send address advertisement."); + + LF_PRINT_DEBUG("Sent port %d to the RTI.", _fed.server_port); + + // Set the global server socket + _fed.server_socket = socket_descriptor; } -/** - * Close sockets used to communicate with other federates, if they are open, - * and send a MSG_TYPE_RESIGN message to the RTI. This implements the function - * defined in reactor.h. For unfederated execution, the code generator - * generates an empty implementation. - * @param env The environment of the federate - */ -void terminate_execution(environment_t* env) { +void lf_enqueue_port_absent_reactions(environment_t* env){ assert(env != GLOBAL_ENVIRONMENT); - - // Check for all outgoing physical connections in - // _fed.sockets_for_outbound_p2p_connections and - // if the socket ID is not -1, the connection is still open. - // Send an EOF by closing the socket here. - // NOTE: It is dangerous to acquire a mutex in a termination - // process because it can block program exit if a deadlock occurs. - // Hence, it is paramount that these mutexes not allow for any - // possibility of deadlock. To ensure this, this - // function should NEVER be called while holding any mutex lock. - lf_mutex_lock(&outbound_socket_mutex); - for (int i=0; i < NUMBER_OF_FEDERATES; i++) { - // Close outbound connections, in case they have not closed themselves. - // This will result in EOF being sent to the remote federate, I think. 
- _lf_close_outbound_socket(i); +#ifdef FEDERATED_CENTRALIZED + if (!_fed.has_downstream) { + // This federate is not connected to any downstream federates via a + // logical connection. No need to trigger port absent + // reactions. + return; } - // Resign the federation, which will close the socket to the RTI. - if (_fed.socket_TCP_RTI >= 0) { - size_t bytes_to_write = 1 + sizeof(tag_t); - unsigned char buffer[bytes_to_write]; - buffer[0] = MSG_TYPE_RESIGN; - tag_t tag = env->current_tag; - encode_tag(&(buffer[1]), tag); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &tag); - ssize_t written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0])); - if (written == bytes_to_write) { - LF_PRINT_LOG("Resigned."); +#endif + LF_PRINT_DEBUG("Enqueueing port absent reactions at time %lld.", (long long) (env->current_tag.time - start_time)); + if (num_port_absent_reactions == 0) { + LF_PRINT_DEBUG("No port absent reactions."); + return; + } + for (int i = 0; i < num_port_absent_reactions; i++) { + reaction_t* reaction = port_absent_reaction[i]; + if (reaction && reaction->status == inactive) { + LF_PRINT_DEBUG("Inserting port absent reaction on reaction queue."); + lf_scheduler_trigger_reaction(env->scheduler, reaction, -1); } } - lf_mutex_unlock(&outbound_socket_mutex); +} - LF_PRINT_DEBUG("Requesting closing of incoming P2P sockets."); - // Request closing the incoming P2P sockets. - for (int i=0; i < NUMBER_OF_FEDERATES; i++) { - if (_lf_request_close_inbound_socket(i) == 0) { - // Sending the close request failed. Mark the socket closed. - _fed.sockets_for_inbound_p2p_connections[i] = -1; +void* lf_handle_p2p_connections_from_federates(void* env_arg) { + assert(env_arg); + environment_t* env = (environment_t *) env_arg; + int received_federates = 0; + // Allocate memory to store thread IDs. 
+ _fed.inbound_socket_listeners = (lf_thread_t*)calloc(_fed.number_of_inbound_p2p_connections, sizeof(lf_thread_t)); + while (received_federates < _fed.number_of_inbound_p2p_connections && !_lf_termination_executed) { + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + int socket_id = accept(_fed.server_socket, &client_fd, &client_length); + + if (socket_id < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + if (rti_failed()) break; + else continue; // Try again. + } else if (errno == EPERM) { + lf_print_error_system_failure("Firewall permissions prohibit connection."); + } else { + lf_print_error_system_failure("A fatal error occurred while accepting a new socket."); + } } - } + LF_PRINT_LOG("Accepted new connection from remote federate."); - LF_PRINT_DEBUG("Waiting for inbound p2p socket listener threads."); - // Wait for each inbound socket listener thread to close. - if (_fed.number_of_inbound_p2p_connections > 0) { - LF_PRINT_LOG("Waiting for %zu threads listening for incoming messages to exit.", - _fed.number_of_inbound_p2p_connections); - for (int i=0; i < _fed.number_of_inbound_p2p_connections; i++) { - // Ignoring errors here. - lf_thread_join(_fed.inbound_socket_listeners[i], NULL); + size_t header_length = 1 + sizeof(uint16_t) + 1; + unsigned char buffer[header_length]; + int read_failed = read_from_socket(socket_id, header_length, (unsigned char*)&buffer); + if (read_failed || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { + lf_print_warning("Federate received invalid first message on P2P socket. Closing socket."); + if (read_failed == 0) { + // Wrong message received. + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = WRONG_SERVER; + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); + // Ignore errors on this response. 
+ write_to_socket(socket_id, 2, response); + } + close(socket_id); + continue; } - } - LF_PRINT_DEBUG("Waiting for RTI's socket listener threads."); - // Wait for the thread listening for messages from the RTI to close. - lf_thread_join(_fed.RTI_socket_listener, NULL); + // Get the federation ID and check it. + unsigned char federation_id_length = buffer[header_length - 1]; + char remote_federation_id[federation_id_length]; + read_failed = read_from_socket(socket_id, federation_id_length, (unsigned char*)remote_federation_id); + if (read_failed || (strncmp(federation_metadata.federation_id, remote_federation_id, strnlen(federation_metadata.federation_id, 255)) != 0)) { + lf_print_warning("Received invalid federation ID. Closing socket."); + if (read_failed == 0) { + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); + // Ignore errors on this response. + write_to_socket(socket_id, 2, response); + } + close(socket_id); + continue; + } - LF_PRINT_DEBUG("Freeing memory occupied by the federate."); - free(_fed.inbound_socket_listeners); - free(federation_metadata.rti_host); - free(federation_metadata.rti_user); -} + // Extract the ID of the sending federate. + uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); + LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, receive_FED_ID, _lf_my_fed_id, remote_fed_id, NULL); -/** - * Thread that listens for inputs from other federates. - * This thread listens for messages of type MSG_TYPE_P2P_MESSAGE, - * MSG_TYPE_P2P_TAGGED_MESSAGE, or MSG_TYPE_PORT_ABSENT (@see net_common.h) from the specified - * peer federate and calls the appropriate handling function for - * each message type. 
If an error occurs or an EOF is received - * from the peer, then this procedure sets the corresponding - * socket in _fed.sockets_for_inbound_p2p_connections - * to -1 and returns, terminating the thread. - * @param _args The remote federate ID (cast to void*). - * @param fed_id_ptr A pointer to a uint16_t containing federate ID being listened to. - * This procedure frees the memory pointed to before returning. - */ -void* listen_to_federates(void* _args) { - uint16_t fed_id = (uint16_t)(uintptr_t)_args; + // Once we record the socket_id here, all future calls to close() on + // the socket should be done while holding the socket_mutex, and this array + // element should be reset to -1 during that critical section. + // Otherwise, there can be race condition where, during termination, + // two threads attempt to simultaneously close the socket. + _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = socket_id; - LF_PRINT_LOG("Listening to federate %d.", fed_id); + // Send an MSG_TYPE_ACK message. + unsigned char response = MSG_TYPE_ACK; - int socket_id = _fed.sockets_for_inbound_p2p_connections[fed_id]; + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); - // Buffer for incoming messages. - // This does not constrain the message size - // because the message will be put into malloc'd memory. - unsigned char buffer[FED_COM_BUFFER_SIZE]; + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.sockets_for_inbound_p2p_connections[remote_fed_id], + 1, (unsigned char*)&response, + &lf_outbound_socket_mutex, + "Failed to write MSG_TYPE_ACK in response to federate %d.", + remote_fed_id); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); - // Listen for messages from the federate. - while (1) { - // Read one byte to get the message type. 
- LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", socket_id); - ssize_t bytes_read = read_from_socket(socket_id, 1, buffer); - if (bytes_read == 0) { - // EOF occurred. This breaks the connection. - lf_print("Received EOF from peer federate %d. Closing the socket.", fed_id); - _lf_close_inbound_socket(fed_id); - break; - } else if (bytes_read < 0) { - lf_print_error("P2P socket to federate %d is broken.", fed_id); - _lf_close_inbound_socket(fed_id); - break; - } - LF_PRINT_DEBUG("Received a P2P message on socket %d of type %d.", - socket_id, buffer[0]); - bool bad_message = false; - switch (buffer[0]) { - case MSG_TYPE_P2P_MESSAGE: - LF_PRINT_LOG("Received untimed message from federate %d.", fed_id); - handle_message(socket_id, fed_id); - break; - case MSG_TYPE_P2P_TAGGED_MESSAGE: - LF_PRINT_LOG("Received timed message from federate %d.", fed_id); - handle_tagged_message(socket_id, fed_id); - break; - case MSG_TYPE_PORT_ABSENT: - LF_PRINT_LOG("Received port absent message from federate %d.", fed_id); - handle_port_absent_message(socket_id, fed_id); - break; - default: - bad_message = true; - } - if (bad_message) { - // FIXME: Better error handling needed. - lf_print_error("Received erroneous message type: %d. Closing the socket.", buffer[0]); - break; - // Trace the event when tracing is enabled - tracepoint_federate_from_federate(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, fed_id, NULL); + // Start a thread to listen for incoming messages from other federates. + // The fed_id is a uint16_t, which we assume can be safely cast to and from void*. + void* fed_id_arg = (void*)(uintptr_t)remote_fed_id; + int result = lf_thread_create( + &_fed.inbound_socket_listeners[received_federates], + listen_to_federates, + fed_id_arg); + if (result != 0) { + // Failed to create a listening thread. 
+ LF_MUTEX_LOCK(socket_mutex); + if (_fed.sockets_for_inbound_p2p_connections[remote_fed_id] != -1) { + close(socket_id); + _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = -1; + } + LF_MUTEX_UNLOCK(socket_mutex); + lf_print_error_and_exit( + "Failed to create a thread to listen for incoming physical connection. Error code: %d.", + result + ); } + + received_federates++; } + + LF_PRINT_LOG("All %zu remote federates are connected.", _fed.number_of_inbound_p2p_connections); return NULL; } -/** - * @brief Stop the traces associated with all environments in the program. - */ -static void stop_all_traces() { - environment_t *env; - int num_envs = _lf_get_environments(&env); - for (int i = 0; i < num_envs; i++) { - stop_trace(env[i].trace); +void lf_latest_tag_complete(tag_t tag_to_send) { + int compare_with_last_tag = lf_tag_compare(_fed.last_sent_LTC, tag_to_send); + if (compare_with_last_tag >= 0) { + return; } + LF_PRINT_LOG("Sending Latest Tag Complete (LTC) " PRINTF_TAG " to the RTI.", + tag_to_send.time - start_time, + tag_to_send.microstep); + send_tag(MSG_TYPE_LATEST_TAG_COMPLETE, tag_to_send); + _fed.last_sent_LTC = tag_to_send; } -/** - * Thread that listens for TCP inputs from the RTI. - * When messages arrive, this calls the appropriate handler. - * @param args Ignored - */ -void* listen_to_rti_TCP(void* args) { - // Buffer for incoming messages. - // This does not constrain the message size - // because the message will be put into malloc'd memory. - unsigned char buffer[FED_COM_BUFFER_SIZE]; - - // Listen for messages from the federate. 
- while (1) { - // Check whether the RTI socket is still valid - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket to the RTI unexpectedly closed."); - return NULL; +parse_rti_code_t lf_parse_rti_addr(const char* rti_addr) { + bool has_host = false, has_port = false, has_user = false; + rti_addr_info_t rti_addr_info = {0}; + extract_rti_addr_info(rti_addr, &rti_addr_info); + if (!rti_addr_info.has_host && !rti_addr_info.has_port && !rti_addr_info.has_user) { + return FAILED_TO_PARSE; + } + if (rti_addr_info.has_host) { + if (validate_host(rti_addr_info.rti_host_str)) { + char* rti_host = (char*) calloc(256, sizeof(char)); + strncpy(rti_host, rti_addr_info.rti_host_str, 255); + federation_metadata.rti_host = rti_host; + } else { + return INVALID_HOST; } - // Read one byte to get the message type. - // This will exit if the read fails. - ssize_t bytes_read = read_from_socket(_fed.socket_TCP_RTI, 1, buffer); - if (bytes_read < 0) { - if (errno == ECONNRESET) { - lf_print_error("Socket connection to the RTI was closed by the RTI without" - " properly sending an EOF first. Considering this a soft error."); - // FIXME: If this happens, possibly a new RTI must be elected. - _fed.socket_TCP_RTI = -1; - return NULL; - } else { - lf_print_error("Socket connection to the RTI has been broken" - " with error %d: %s. The RTI should" - " close connections with an EOF first." - " Considering this a soft error.", - errno, - strerror(errno)); - // FIXME: If this happens, possibly a new RTI must be elected. - _fed.socket_TCP_RTI = -1; - return NULL; - } - } else if (bytes_read == 0) { - // EOF received. 
- lf_print("Connection to the RTI closed with an EOF."); - _fed.socket_TCP_RTI = -1; - stop_all_traces(); - return NULL; + } + if (rti_addr_info.has_port) { + if (validate_port(rti_addr_info.rti_port_str)) { + federation_metadata.rti_port = atoi(rti_addr_info.rti_port_str); + } else { + return INVALID_PORT; } - switch (buffer[0]) { - case MSG_TYPE_TAGGED_MESSAGE: - handle_tagged_message(_fed.socket_TCP_RTI, -1); - break; - case MSG_TYPE_TAG_ADVANCE_GRANT: - handle_tag_advance_grant(); - break; - case MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT: - handle_provisional_tag_advance_grant(); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(); - break; - case MSG_TYPE_STOP_GRANTED: - handle_stop_granted_message(); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(_fed.socket_TCP_RTI, -1); - break; - case MSG_TYPE_CLOCK_SYNC_T1: - case MSG_TYPE_CLOCK_SYNC_T4: - lf_print_error("Federate %d received unexpected clock sync message from RTI on TCP socket.", - _lf_my_fed_id); - break; - default: - lf_print_error_and_exit("Received from RTI an unrecognized TCP message type: %hhx.", buffer[0]); - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, NULL); - } } - return NULL; + if (rti_addr_info.has_user) { + if (validate_user(rti_addr_info.rti_user_str)) { + char* rti_user = (char*) calloc(256, sizeof(char)); + strncpy(rti_user, rti_addr_info.rti_user_str, 255); + federation_metadata.rti_user = rti_user; + } else { + return INVALID_USER; + } + } + return SUCCESS; } -void synchronize_with_other_federates(void) { - - LF_PRINT_DEBUG("Synchronizing with other federates."); - - // Reset the start time to the coordinated start time for all federates. - // Note that this does not grant execution to this federate. - start_time = get_start_time_from_rti(lf_time_physical()); - - // Start a thread to listen for incoming TCP messages from the RTI. 
- // @note Up until this point, the federate has been listening for messages - // from the RTI in a sequential manner in the main thread. From now on, a - // separate thread is created to allow for asynchronous communication. - lf_thread_create(&_fed.RTI_socket_listener, listen_to_rti_TCP, NULL); - lf_thread_t thread_id; - if (create_clock_sync_thread(&thread_id)) { - lf_print_warning("Failed to create thread to handle clock synchronization."); +void lf_reset_status_fields_on_input_port_triggers() { + environment_t *env; + _lf_get_environments(&env); + tag_t now = lf_tag(env); + for (int i = 0; i < _lf_action_table_size; i++) { + if (lf_tag_compare(_lf_action_table[i]->trigger->last_known_status_tag, now) >= 0) { + set_network_port_status(i, absent); // Default may be overriden to become present. + } else { + set_network_port_status(i, unknown); + } } + LF_PRINT_DEBUG("Resetting port status fields."); + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_cond_broadcast(&lf_port_status_changed); } -/** - * Modify the specified tag, if necessary, to be an earlier tag based - * on the current physical time. The earlier tag is necessary if this federate - * has downstream federates and also has physical actions that may trigger - * outputs. In that case, the earlier tag will be the current physical time - * plus the minimum delay on all such physical actions plus any other delays - * along the path from the triggering physical action to the output port - * minus one nanosecond. The modified tag is assured of being less than any - * output tag that might later be produced. - * @param tag A pointer to the proposed NET. - * @return True if this federate requires this modification and the tag was - * modified. - */ -bool _lf_bounded_NET(tag_t* tag) { - // The tag sent by this function is a promise that, absent - // inputs from another federate, this federate will not produce events - // earlier than t. 
But if there are downstream federates and there is - // a physical action (not counting receivers from upstream federates), - // then we can only promise up to current physical time (plus the minimum - // of all minimum delays on the physical actions). - // In this case, we send a NET message with the current physical time - // to permit downstream federates to advance. To avoid - // overwhelming the network, this NET message should be sent periodically - // at specified intervals controlled by the target parameter - // coordination-options: {advance-message-interval: time units}. - // The larger the interval, the more downstream federates will lag - // behind real time, but the less network traffic. If this option is - // missing, we issue a warning message suggesting that a redesign - // might be in order so that outputs don't depend on physical actions. - LF_PRINT_DEBUG("Checking NET to see whether it should be bounded by physical time." - " Min delay from physical action: " PRINTF_TIME ".", - _fed.min_delay_from_physical_action_to_federate_output); - if (_fed.min_delay_from_physical_action_to_federate_output >= 0LL - && _fed.has_downstream - ) { - // There is a physical action upstream of some output from this - // federate, and there is at least one downstream federate. - // Compare the tag to the current physical time. 
- instant_t physical_time = lf_time_physical(); - if (physical_time + _fed.min_delay_from_physical_action_to_federate_output < tag->time) { - // Can only promise up and not including this new time: - tag->time = physical_time + _fed.min_delay_from_physical_action_to_federate_output - 1L; - tag->microstep = 0; - LF_PRINT_LOG("Has physical actions that bound NET to " PRINTF_TAG ".", - tag->time - start_time, tag->microstep); - return true; - } +int lf_send_message(int message_type, + unsigned short port, + unsigned short federate, + const char* next_destination_str, + size_t length, + unsigned char* message) { + unsigned char header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t)]; + // First byte identifies this as a timed message. + if (message_type != MSG_TYPE_P2P_MESSAGE ) { + lf_print_error("lf_send_message: Unsupported message type (%d).", message_type); + return -1; + } + header_buffer[0] = (unsigned char)message_type; + // Next two bytes identify the destination port. + // NOTE: Send messages little endian (network order), not big endian. + encode_uint16(port, &(header_buffer[1])); + + // Next two bytes identify the destination federate. + encode_uint16(federate, &(header_buffer[1 + sizeof(uint16_t)])); + + // The next four bytes are the message length. + encode_int32((int32_t)length, &(header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t)])); + + LF_PRINT_LOG("Sending untagged message to %s.", next_destination_str); + + // Header: message_type + port_id + federate_id + length of message + timestamp + microstep + const int header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); + + // Use a mutex lock to prevent multiple threads from simultaneously sending. 
+ LF_MUTEX_LOCK(lf_outbound_socket_mutex); + + int* socket = &_fed.sockets_for_outbound_p2p_connections[federate]; + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_P2P_MSG, _lf_my_fed_id, federate, NULL); + + int result = write_to_socket_close_on_error(socket, header_length, header_buffer); + if (result == 0) { + // Header sent successfully. Send the body. + result = write_to_socket_close_on_error(socket, length, message); } - return false; + if (result != 0) { + // Message did not send. Since this is used for physical connections, this is not critical. + lf_print_warning("Failed to send message to %s. Dropping the message.", next_destination_str); + } + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return result; } -/** - * If this federate depends on upstream federates or sends data to downstream - * federates, then send to the RTI a NET, which will give the tag of the - * earliest event on the event queue, or, if the queue is empty, the timeout - * time, or, if there is no timeout, FOREVER. - * - * If there are network outputs that - * depend on physical actions, then insert a dummy event to ensure this federate - * advances its tag so that downstream federates can make progress. - * - * A NET is a promise saying that, absent network inputs, this federate will - * not produce an output message with tag earlier than the NET value. - * - * If there are upstream federates, then after sending a NET, this will block - * until either the RTI grants the advance to the requested time or the wait - * for the response from the RTI is interrupted by a change in the event queue - * (e.g., a physical action triggered or a network message arrived). - * If there are no upstream federates, then it will not wait for a TAG - * (which won't be forthcoming anyway) and returns the earliest tag on the event queue. 
- * - * If the federate has neither upstream nor downstream federates, then this - * returns the specified tag immediately without sending anything to the RTI. - * - * If there is at least one physical action somewhere in the federate that can - * trigger an output to a downstream federate, then the NET is required to be - * less than the current physical time. If physical time is less than the - * earliest event in the event queue (or the event queue is empty), then this - * function will insert a dummy event with a tag equal to the current physical - * time (and a microstep of 0). This will enforce advancement of tag for this - * federate and causes a NET message to be sent repeatedly as physical time - * advances with the time interval between messages controlled by the target - * parameter coordination-options: {advance-message-interval timevalue}. It will - * stop creating dummy events if and when its event queue has an event with a - * timestamp less than physical time. - * - * If wait_for_reply is false, then this function will simply send the - * specified tag and return that tag immediately. This is useful when a - * federate is shutting down and will not be sending any more messages at all. - * - * In all cases, this returns either the specified tag or - * another tag when it is safe to advance logical time to the returned tag. - * The returned tag may be less than the specified tag if there are upstream - * federates and either the RTI responds with a lesser tag or - * the wait for a response from the RTI is interrupted by a - * change in the event queue. - * - * This function is used in centralized coordination only. - * - * This function assumes the caller holds the mutex lock. - * - * @param env The environment of the federate - * @param tag The tag. - * @param wait_for_reply If true, wait for a reply. 
- */ -tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply) { +tag_t lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply) { assert(env != GLOBAL_ENVIRONMENT); while (true) { if (!_fed.has_downstream && !_fed.has_upstream) { @@ -2722,7 +2419,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply return _fed.last_TAG; } - // Copy the tag because _lf_bounded_NET() may modify it. + // Copy the tag because bounded_NET() may modify it. tag_t original_tag = tag; // A NET sent by this function is a promise that, absent @@ -2733,7 +2430,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply // of all minimum delays on the physical actions). // If wait_for_reply is false, leave the tag alone. bool tag_bounded_by_physical_time = wait_for_reply ? - _lf_bounded_NET(&tag) + bounded_NET(&tag) : false; // What we do next depends on whether the NET has been bounded by @@ -2742,7 +2439,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply // This if statement does not fall through but rather returns. // NET is not bounded by physical time or has no downstream federates. // Normal case. - _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, tag, wait_for_reply); + send_tag(MSG_TYPE_NEXT_EVENT_TAG, tag); _fed.last_sent_NET = tag; LF_PRINT_LOG("Sent next event tag (NET) " PRINTF_TAG " to RTI.", tag.time - start_time, tag.microstep); @@ -2768,7 +2465,9 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply while (true) { // Wait until either something changes on the event queue or // the RTI has responded with a TAG. 
- LF_PRINT_DEBUG("Waiting for a TAG from the RTI with _fed.last_TAG.time=%lld, %lld and net=%lld, %lld", (long long) _fed.last_TAG.time - start_time, (long long) _fed.last_TAG.microstep, (long long) tag.time - start_time, (long long) tag.microstep); + LF_PRINT_DEBUG("Waiting for a TAG from the RTI with _fed.last_TAG= " PRINTF_TAG " and net=" PRINTF_TAG, + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep, + tag.time - start_time, tag.microstep); if (lf_cond_wait(&env->event_q_changed) != 0) { lf_print_error("Wait error."); } @@ -2781,7 +2480,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply return _fed.last_TAG; } if (lf_tag_compare(next_tag, tag) != 0) { - _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, next_tag, wait_for_reply); + send_tag(MSG_TYPE_NEXT_EVENT_TAG, next_tag); _fed.last_sent_NET = next_tag; LF_PRINT_LOG("Sent next event tag (NET) " PRINTF_TAG " to RTI from loop.", next_tag.time - lf_time_start(), next_tag.microstep); @@ -2790,9 +2489,10 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply } if (tag.time != FOREVER) { - // Create a dummy event that will force this federate to advance time and subsequently enable progress for - // downstream federates. - event_t* dummy = _lf_create_dummy_events(env, NULL, tag.time, NULL, 0); + // Create a dummy event that will force this federate to advance time and subsequently + // enable progress for downstream federates. Increment the time by ADVANCE_MESSAGE_INTERVAL + // to prevent too frequent dummy events. 
+ event_t* dummy = _lf_create_dummy_events(env, NULL, tag.time + ADVANCE_MESSAGE_INTERVAL, NULL, 0); pqueue_insert(env->event_q, dummy); } @@ -2800,7 +2500,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply tag.time - lf_time_start()); if (!wait_for_reply) { - LF_PRINT_LOG("Not waiting physical time to advance further."); + LF_PRINT_LOG("Not waiting for physical time to advance further."); return tag; } @@ -2809,7 +2509,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply // RTI. That amount of time will be no greater than ADVANCE_MESSAGE_INTERVAL in the future. LF_PRINT_DEBUG("Waiting for physical time to elapse or an event on the event queue."); - // The above call to _lf_bounded_NET called lf_time_physical() + // The above call to bounded_NET called lf_time_physical() // set _lf_last_reported_unadjusted_physical_time_ns, the // time obtained using CLOCK_REALTIME before adjustment for // clock synchronization. Since that is the clock used by @@ -2835,53 +2535,301 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply } } -/** - * Parse the address of the RTI and store them into the global federation_metadata struct. - * @return a parse_rti_code_t indicating the result of the parse. 
- */ -parse_rti_code_t parse_rti_addr(const char* rti_addr) { - bool has_host = false, has_port = false, has_user = false; - rti_addr_info_t rti_addr_info = {0}; - extract_rti_addr_info(rti_addr, &rti_addr_info); - if (!rti_addr_info.has_host && !rti_addr_info.has_port && !rti_addr_info.has_user) { - return FAILED_TO_PARSE; +void lf_send_port_absent_to_federate( + environment_t* env, + interval_t additional_delay, + unsigned short port_ID, + unsigned short fed_ID) { + assert(env != GLOBAL_ENVIRONMENT); + + // Construct the message + size_t message_length = 1 + sizeof(port_ID) + sizeof(fed_ID) + sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[message_length]; + + // Apply the additional delay to the current tag and use that as the intended + // tag of the outgoing message. Note that if there is delay on the connection, + // then we cannot promise no message with tag = current_tag + delay because a + // subsequent reaction might produce such a message. But we can promise no + // message with a tag strictly less than current_tag + delay. 
+ tag_t current_message_intended_tag = lf_delay_strict(env->current_tag, additional_delay); + + LF_PRINT_LOG("Sending port " + "absent for tag " PRINTF_TAG " for port %d to federate %d.", + current_message_intended_tag.time - start_time, + current_message_intended_tag.microstep, + port_ID, fed_ID); + + buffer[0] = MSG_TYPE_PORT_ABSENT; + encode_uint16(port_ID, &(buffer[1])); + encode_uint16(fed_ID, &(buffer[1+sizeof(port_ID)])); + encode_tag(&(buffer[1+sizeof(port_ID)+sizeof(fed_ID)]), current_message_intended_tag); + +#ifdef FEDERATED_CENTRALIZED + // Send the absent message through the RTI + int* socket = &_fed.socket_TCP_RTI; +#else + // Send the absent message directly to the federate + int* socket = &_fed.sockets_for_outbound_p2p_connections[fed_ID]; +#endif + + if (socket == &_fed.socket_TCP_RTI) { + tracepoint_federate_to_rti( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, ¤t_message_intended_tag); + } else { + tracepoint_federate_to_federate( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, fed_ID, ¤t_message_intended_tag); } - if (rti_addr_info.has_host) { - if (validate_host(rti_addr_info.rti_host_str)) { - char* rti_host = (char*) calloc(256, sizeof(char)); - strncpy(rti_host, rti_addr_info.rti_host_str, 255); - federation_metadata.rti_host = rti_host; + + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + int result = write_to_socket_close_on_error(socket, message_length, buffer); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + + if (result != 0) { + // Write failed. Response depends on whether coordination is centralized. + if (socket == &_fed.socket_TCP_RTI) { + // Centralized coordination. This is a critical error. + lf_print_error_system_failure("Failed to send port absent message for port %hu to federate %hu.", + port_ID, fed_ID); } else { - return INVALID_HOST; + // Decentralized coordination. This is not a critical error. 
+ lf_print_warning("Failed to send port absent message for port %hu to federate %hu.", + port_ID, fed_ID); } } - if (rti_addr_info.has_port) { - if (validate_port(rti_addr_info.rti_port_str)) { - federation_metadata.rti_port = atoi(rti_addr_info.rti_port_str); - } else { - return INVALID_PORT; +} + +int lf_send_stop_request_to_rti(tag_t stop_tag) { + + // Send a stop request with the specified tag to the RTI + unsigned char buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; + // Stop at the next microstep + stop_tag.microstep++; + ENCODE_STOP_REQUEST(buffer, stop_tag.time, stop_tag.microstep); + + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + // Do not send a stop request if a stop request has been previously received from the RTI. + if (!_fed.received_stop_request_from_rti) { + LF_PRINT_LOG("Sending to RTI a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + stop_tag.time - start_time, + stop_tag.microstep); + + if (_fed.socket_TCP_RTI < 0) { + lf_print_warning("Socket is no longer connected. Dropping message."); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return -1; } + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); + + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, + buffer, &lf_outbound_socket_mutex, + "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); + + // Treat this sending as equivalent to having received a stop request from the RTI. 
+ _fed.received_stop_request_from_rti = true; + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return 0; + } else { + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return 1; } - if (rti_addr_info.has_user) { - if (validate_user(rti_addr_info.rti_user_str)) { - char* rti_user = (char*) calloc(256, sizeof(char)); - strncpy(rti_user, rti_addr_info.rti_user_str, 255); - federation_metadata.rti_user = rti_user; +} + +int lf_send_tagged_message(environment_t* env, + interval_t additional_delay, + int message_type, + unsigned short port, + unsigned short federate, + const char* next_destination_str, + size_t length, + unsigned char* message) { + assert(env != GLOBAL_ENVIRONMENT); + + size_t header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t); + unsigned char header_buffer[header_length]; + + if (message_type != MSG_TYPE_TAGGED_MESSAGE && message_type != MSG_TYPE_P2P_TAGGED_MESSAGE) { + lf_print_error("lf_send_message: Unsupported message type (%d).", message_type); + return -1; + } + + size_t buffer_head = 0; + // First byte is the message type. + header_buffer[buffer_head] = (unsigned char)message_type; + buffer_head += sizeof(unsigned char); + // Next two bytes identify the destination port. + // NOTE: Send messages little endian, not big endian. + encode_uint16(port, &(header_buffer[buffer_head])); + buffer_head += sizeof(uint16_t); + + // Next two bytes identify the destination federate. + encode_uint16(federate, &(header_buffer[buffer_head])); + buffer_head += sizeof(uint16_t); + + // The next four bytes are the message length. + encode_int32((int32_t)length, &(header_buffer[buffer_head])); + buffer_head += sizeof(int32_t); + + // Apply the additional delay to the current tag and use that as the intended + // tag of the outgoing message. 
+ tag_t current_message_intended_tag = lf_delay_tag(env->current_tag, additional_delay); + + if (_lf_is_tag_after_stop_tag(env, current_message_intended_tag)) { + // Message tag is past the timeout time (the stop time) so it should not be sent. + LF_PRINT_LOG("Dropping message because it will be after the timeout time."); + return -1; + } + + // Next 8 + 4 will be the tag (timestamp, microstep) + encode_tag( + &(header_buffer[buffer_head]), + current_message_intended_tag + ); + + LF_PRINT_LOG("Sending message with tag " PRINTF_TAG " to %s.", + current_message_intended_tag.time - start_time, + current_message_intended_tag.microstep, + next_destination_str); + + // Use a mutex lock to prevent multiple threads from simultaneously sending. + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + + int* socket; + if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { + socket = &_fed.sockets_for_outbound_p2p_connections[federate]; + tracepoint_federate_to_federate(_fed.trace, send_P2P_TAGGED_MSG, _lf_my_fed_id, federate, ¤t_message_intended_tag); + } else { + socket = &_fed.socket_TCP_RTI; + tracepoint_federate_to_rti(_fed.trace, send_TAGGED_MSG, _lf_my_fed_id, ¤t_message_intended_tag); + } + + int result = write_to_socket_close_on_error(socket, header_length, header_buffer); + if (result == 0) { + // Header sent successfully. Send the body. + result = write_to_socket_close_on_error(socket, length, message); + } + if (result != 0) { + // Message did not send. Handling depends on message type. + if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { + lf_print_warning("Failed to send message to %s. Dropping the message.", next_destination_str); } else { - return INVALID_USER; + lf_print_error_system_failure("Failed to send message to %s. Connection lost to the RTI.", + next_destination_str); } } - return SUCCESS; + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return result; } -/** - * Sets the federation_id of this federate to fid. 
- */ -void set_federation_id(const char* fid) { +void lf_set_federation_id(const char* fid) { federation_metadata.federation_id = fid; } -void set_federation_trace_object(trace_t * trace) { +void lf_set_federation_trace_object(trace_t * trace) { _fed.trace = trace; } + +#ifdef FEDERATED_DECENTRALIZED +void lf_spawn_staa_thread(){ + lf_thread_create(&_fed.staaSetter, update_ports_from_staa_offsets, NULL); +} +#endif // FEDERATED_DECENTRALIZED + +void lf_stall_advance_level_federation(environment_t* env, size_t level) { + LF_PRINT_DEBUG("Acquiring the environment mutex."); + LF_MUTEX_LOCK(env->mutex); + LF_PRINT_DEBUG("Waiting on MLAA with next_reaction_level %zu and MLAA %d.", level, max_level_allowed_to_advance); + while (((int) level) >= max_level_allowed_to_advance) { + lf_cond_wait(&lf_port_status_changed); + }; + LF_PRINT_DEBUG("Exiting wait with MLAA %d and next_reaction_level %zu.", max_level_allowed_to_advance, level); + LF_MUTEX_UNLOCK(env->mutex); +} + +void lf_synchronize_with_other_federates(void) { + + LF_PRINT_DEBUG("Synchronizing with other federates."); + + // Reset the start time to the coordinated start time for all federates. + // Note that this does not grant execution to this federate. + start_time = get_start_time_from_rti(lf_time_physical()); + + // Start a thread to listen for incoming TCP messages from the RTI. + // @note Up until this point, the federate has been listening for messages + // from the RTI in a sequential manner in the main thread. From now on, a + // separate thread is created to allow for asynchronous communication. + lf_thread_create(&_fed.RTI_socket_listener, listen_to_rti_TCP, NULL); + lf_thread_t thread_id; + if (create_clock_sync_thread(&thread_id)) { + lf_print_warning("Failed to create thread to handle clock synchronization."); + } +} + +bool lf_update_max_level(tag_t tag, bool is_provisional) { + // This always needs the top-level environment, which will be env[0]. 
+ environment_t *env; + _lf_get_environments(&env); + int prev_max_level_allowed_to_advance = max_level_allowed_to_advance; + max_level_allowed_to_advance = INT_MAX; +#ifdef FEDERATED_DECENTRALIZED + size_t action_table_size = _lf_action_table_size; + lf_action_base_t** action_table = _lf_action_table; +#else + // Note that the following test is never true for decentralized coordination, + // where tag always is NEVER_TAG. + if ((lf_tag_compare(env->current_tag, tag) < 0) || ( + lf_tag_compare(env->current_tag, tag) == 0 && !is_provisional + )) { + LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", + max_level_allowed_to_advance, + lf_time_logical_elapsed(env) + ); + // Safe to complete the current tag + return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); + } + + size_t action_table_size = _lf_zero_delay_cycle_action_table_size; + lf_action_base_t** action_table = _lf_zero_delay_cycle_action_table; +#endif // FEDERATED_DECENTRALIZED + for (int i = 0; i < action_table_size; i++) { + lf_action_base_t* input_port_action = action_table[i]; +#ifdef FEDERATED_DECENTRALIZED + // In decentralized execution, if the current_tag is close enough to the + // start tag and there is a large enough delay on an incoming + // connection, then there is no need to block progress waiting for this + // port status. This is irrelevant for centralized because blocking only + // occurs on zero-delay cycles. 
+ if ( + (_lf_action_delay_table[i] == 0 && env->current_tag.time == start_time && env->current_tag.microstep == 0) + || (_lf_action_delay_table[i] > 0 && lf_tag_compare( + env->current_tag, + lf_delay_strict((tag_t) {.time=start_time, .microstep=0}, _lf_action_delay_table[i]) + ) <= 0) + ) { + continue; + } +#endif // FEDERATED_DECENTRALIZED + // If the current tag is greater than the last known status tag of the input port, + // and the input port is not physical, then block on that port by ensuring + // the MLAA is no greater than the level of that port. + // For centralized coordination, this is applied only to input ports coming from + // federates that are in a ZDC. For decentralized coordination, this is applied + // to all input ports. + if (lf_tag_compare(env->current_tag, + input_port_action->trigger->last_known_status_tag) > 0 + && !input_port_action->trigger->is_physical) { + max_level_allowed_to_advance = LF_MIN( + max_level_allowed_to_advance, + ((int) LF_LEVEL(input_port_action->trigger->reactions[0]->index)) + ); + } + } + LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", + max_level_allowed_to_advance, + lf_time_logical_elapsed(env) + ); + return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); +} + #endif diff --git a/core/federated/network/net_util.c b/core/federated/network/net_util.c index 99c6e6cf8..754a28ada 100644 --- a/core/federated/network/net_util.c +++ b/core/federated/network/net_util.c @@ -56,120 +56,172 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** Number of nanoseconds to sleep before retrying a socket read. */ #define SOCKET_READ_RETRY_INTERVAL 1000000 +// Mutex lock held while performing socket close operations. +// A deadlock can occur if two threads simulataneously attempt to close the same socket. 
+lf_mutex_t socket_mutex; + int create_real_time_tcp_socket_errexit() { int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { - lf_print_error_and_exit("Could not open TCP socket. Err=%d", sock); + lf_print_error_system_failure("Could not open TCP socket."); } // Disable Nagle's algorithm which bundles together small TCP messages to - // reduce network traffic + // reduce network traffic. // TODO: Re-consider if we should do this, and whether disabling delayed ACKs - // is enough. + // is enough. int flag = 1; int result = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(int)); if (result < 0) { - lf_print_error_and_exit("Failed to disable Nagle algorithm on socket server."); + lf_print_error_system_failure("Failed to disable Nagle algorithm on socket server."); } +#if defined(PLATFORM_Linux) // Disable delayed ACKs. Only possible on Linux - #if defined(PLATFORM_Linux) result = setsockopt(sock, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(int)); if (result < 0) { - lf_print_error_and_exit("Failed to disable Nagle algorithm on socket server."); + lf_print_error_system_failure("Failed to disable Nagle algorithm on socket server."); } - #endif +#endif // Linux return sock; } -ssize_t read_from_socket_errexit( - int socket, - size_t num_bytes, - unsigned char* buffer, - char* format, ...) { - va_list args; - // Error checking first - if (socket < 0 && format != NULL) { - lf_print_error("Socket is no longer open."); - lf_print_error_and_exit(format, args); - } +int read_from_socket(int socket, size_t num_bytes, unsigned char* buffer) { + if (socket < 0) { + // Socket is not open. 
+ errno = EBADF; + return -1; + } ssize_t bytes_read = 0; + int retry_count = 0; while (bytes_read < (ssize_t)num_bytes) { ssize_t more = read(socket, buffer + bytes_read, num_bytes - (size_t)bytes_read); - if(more <= 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { - // The error code set by the socket indicates + if(more < 0 && retry_count++ < NUM_SOCKET_RETRIES + && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) { + // Those error codes set by the socket indicates // that we should try again (@see man errno). - LF_PRINT_DEBUG("Reading from socket was blocked. Will try again."); + lf_print_warning("Reading from socket failed. Will try again."); + lf_sleep(DELAY_BETWEEN_SOCKET_RETRIES); continue; - } else if (more <= 0) { - if (format != NULL) { - shutdown(socket, SHUT_RDWR); - close(socket); - lf_print_error("Read %ld bytes, but expected %zu. errno=%d", - more + bytes_read, num_bytes, errno); - lf_print_error_and_exit(format, args); - } else if (more == 0) { - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // upon receiving a zero length packet or an error, we can close the socket. - // If there are any pending outgoing messages, this will attempt to send those - // followed by an EOF. - close(socket); - } - return more; + } else if (more < 0) { + // A more serious error occurred. + return -1; + } else if (more == 0) { + // EOF received. + return 1; } bytes_read += more; } - return bytes_read; + return 0; } -ssize_t read_from_socket(int socket, size_t num_bytes, unsigned char* buffer) { - return read_from_socket_errexit(socket, num_bytes, buffer, NULL); +int read_from_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer) { + assert(socket); + int read_failed = read_from_socket(*socket, num_bytes, buffer); + if (read_failed) { + // Read failed. + // Socket has probably been closed from the other side. + // Shut down and close the socket from this side. 
+ shutdown(*socket, SHUT_RDWR); + close(*socket); + // Mark the socket closed. + *socket = -1; + return -1; + } + return 0; } -ssize_t write_to_socket_with_mutex( - int socket, +void read_from_socket_fail_on_error( + int* socket, size_t num_bytes, unsigned char* buffer, lf_mutex_t* mutex, char* format, ...) { + va_list args; + assert(socket); + int read_failed = read_from_socket_close_on_error(socket, num_bytes, buffer); + if (read_failed) { + // Read failed. + if (mutex != NULL) { + lf_mutex_unlock(mutex); + } + if (format != NULL) { + lf_print_error_system_failure(format, args); + } else { + lf_print_error_system_failure("Failed to read from socket."); + } + } +} + +ssize_t peek_from_socket(int socket, unsigned char* result) { + ssize_t bytes_read = recv(socket, result, 1, MSG_DONTWAIT | MSG_PEEK); + if (bytes_read < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) return 0; + else return bytes_read; +} + +int write_to_socket(int socket, size_t num_bytes, unsigned char* buffer) { + if (socket < 0) { + // Socket is not open. + errno = EBADF; + return -1; + } ssize_t bytes_written = 0; va_list args; while (bytes_written < (ssize_t)num_bytes) { ssize_t more = write(socket, buffer + bytes_written, num_bytes - (size_t)bytes_written); - if (more <= 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { - // The error code set by the socket indicates - // that we should try again (@see man errno). + if (more <= 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) { + // The error codes EAGAIN or EWOULDBLOCK indicate + // that we should try again (@see man errno). + // The error code EINTR means the system call was interrupted before completing. LF_PRINT_DEBUG("Writing to socket was blocked. 
Will try again."); + lf_sleep(DELAY_BETWEEN_SOCKET_RETRIES); continue; - } else if (more <= 0) { - if (format != NULL) { - shutdown(socket, SHUT_RDWR); - close(socket); - if (mutex != NULL) { - lf_mutex_unlock(mutex); - } - lf_print_error(format, args); - lf_print_error("Code %d: %s.", errno, strerror(errno)); - } - return more; + } else if (more < 0) { + // A more serious error occurred. + return -1; } bytes_written += more; } - return bytes_written; + return 0; +} + +int write_to_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer) { + assert(socket); + int result = write_to_socket(*socket, num_bytes, buffer); + if (result) { + // Write failed. + // Socket has probably been closed from the other side. + // Shut down and close the socket from this side. + shutdown(*socket, SHUT_RDWR); + close(*socket); + // Mark the socket closed. + *socket = -1; + } + return result; } -ssize_t write_to_socket_errexit( - int socket, +void write_to_socket_fail_on_error( + int* socket, size_t num_bytes, unsigned char* buffer, + lf_mutex_t* mutex, char* format, ...) { - return write_to_socket_with_mutex(socket, num_bytes, buffer, NULL, format); -} - -ssize_t write_to_socket(int socket, size_t num_bytes, unsigned char* buffer) { - return write_to_socket_with_mutex(socket, num_bytes, buffer, NULL, NULL); + va_list args; + assert(socket); + int result = write_to_socket_close_on_error(socket, num_bytes, buffer); + if (result) { + // Write failed. + if (mutex != NULL) { + lf_mutex_unlock(mutex); + } + if (format != NULL) { + lf_print_error_system_failure(format, args); + } else { + lf_print_error("Failed to write to socket. 
Closing it."); + } + } } #endif // FEDERATED diff --git a/core/reactor.c b/core/reactor.c index ce95b057d..ceace5125 100644 --- a/core/reactor.c +++ b/core/reactor.c @@ -377,13 +377,12 @@ int lf_reactor_c_main(int argc, const char* argv[]) { NEVER_TAG.time - start_time, 0); environment_init_tags(env, start_time, duration); - // Start tracing if enalbed + // Start tracing if enabled. start_trace(env->trace); #ifdef MODAL_REACTORS // Set up modal infrastructure _lf_initialize_modes(env); #endif - _lf_execution_started = true; _lf_trigger_startup_reactions(env); _lf_initialize_timers(env); // If the stop_tag is (0,0), also insert the shutdown @@ -394,9 +393,11 @@ int lf_reactor_c_main(int argc, const char* argv[]) { } LF_PRINT_DEBUG("Running the program's main loop."); // Handle reactions triggered at time (T,m). + env->execution_started = true; if (_lf_do_step(env)) { while (next(env) != 0); } + _lf_normal_termination = true; return 0; } else { return -1; diff --git a/core/reactor_common.c b/core/reactor_common.c index 6984d920b..938202909 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -99,12 +99,6 @@ unsigned int _lf_number_of_workers = 0u; */ instant_t duration = -1LL; -/** - * Indicates whether or not the execution - * has started. - */ -bool _lf_execution_started = false; - /** Indicator of whether the keepalive command-line option was given. 
*/ bool keepalive_specified = false; @@ -116,6 +110,16 @@ bool keepalive_specified = false; */ interval_t _lf_fed_STA_offset = 0LL; +void _lf_print_event(void* event) { + if (event == NULL) { + printf("NULL"); + } else { + event_t* ev = (event_t*)event; + lf_print("Event: Time=" PRINTF_TIME ", dummy=%d, timer=%d", + ev->time - start_time, ev->is_dummy, ev->trigger->is_timer); + } +} + /** * Allocate memory using calloc (so the allocated memory is zeroed out) * and record the allocated memory on the specified self struct so that @@ -266,6 +270,11 @@ void _lf_trigger_reaction(environment_t* env, reaction_t* reaction, int worker_n */ void _lf_start_time_step(environment_t *env) { assert(env != GLOBAL_ENVIRONMENT); + if (!env->execution_started) { + // Execution hasn't started, so this is probably being invoked in termination + // due to an error. + return; + } LF_PRINT_LOG("--------- Start time step at tag " PRINTF_TAG ".", env->current_tag.time - start_time, env->current_tag.microstep); // Handle dynamically created tokens for mutable inputs. _lf_free_token_copies(env); @@ -291,22 +300,29 @@ void _lf_start_time_step(environment_t *env) { } } } + env->is_present_fields_abbreviated_size = 0; + +#ifdef FEDERATED + // If the environment is the top-level one, we have some work to do. + environment_t *envs; + int num_envs = _lf_get_environments(&envs); + if (env == envs) { + // This is the top-level environment. #ifdef FEDERATED_DECENTRALIZED - for (int i = 0; i < env->is_present_fields_size; i++) { - // FIXME: For now, an intended tag of (NEVER, 0) - // indicates that it has never been set. - *env->_lf_intended_tag_fields[i] = (tag_t) {NEVER, 0}; + for (int i = 0; i < env->is_present_fields_size; i++) { + // An intended tag of NEVER_TAG indicates that it has never been set. 
+ *env->_lf_intended_tag_fields[i] = NEVER_TAG; + } +#endif // FEDERATED_DECENTRALIZED + + // Reset absent fields on network ports because + // their status is unknown + lf_reset_status_fields_on_input_port_triggers(); + // Signal the helper thread to reset its progress since the logical time has changed. + lf_cond_signal(&lf_current_tag_changed); } -#endif -#ifdef FEDERATED - // Reset absent fields on network ports because - // their status is unknown - reset_status_fields_on_input_port_triggers(); - // Signal the helper thread to reset its progress since the logical time has changed. - lf_cond_signal(&logical_time_changed); -#endif - env->is_present_fields_abbreviated_size = 0; +#endif // FEDERATED } /** @@ -349,8 +365,8 @@ void _lf_pop_events(environment_t *env) { } #ifdef MODAL_REACTORS - // If this event is associated with an incative it should haven been suspended and no longer on the event queue. - // FIXME This should not be possible + // If this event is associated with an inactive mode it should have been suspended and should no longer be on the event queue. + // NOTE: This should not be possible if (!_lf_mode_is_active(event->trigger->mode)) { lf_print_warning("Assumption violated. There is an event on the event queue that is associated to an inactive mode."); } @@ -366,22 +382,28 @@ void _lf_pop_events(environment_t *env) { #ifdef FEDERATED_DECENTRALIZED // In federated execution, an intended tag that is not (NEVER, 0) // indicates that this particular event is triggered by a network message. - // The intended tag is set in handle_timed_message in federate.c whenever - // a timed message arrives from another federate. + // The intended tag is set in handle_tagged_message in federate.c whenever + // a tagged message arrives from another federate. if (event->intended_tag.time != NEVER) { // If the intended tag of the event is actually set, // transfer the intended tag to the trigger so that // the reaction can access the value.
event->trigger->intended_tag = event->intended_tag; // And check if it is in the past compared to the current tag. - if (lf_tag_compare(event->intended_tag, - env->current_tag) < 0) { + if (lf_tag_compare(event->intended_tag, env->current_tag) < 0) { // Mark the triggered reaction with a STP violation reaction->is_STP_violated = true; LF_PRINT_LOG("Trigger %p has violated the reaction's STP offset. Intended tag: " PRINTF_TAG ". Current tag: " PRINTF_TAG, event->trigger, event->intended_tag.time - start_time, event->intended_tag.microstep, env->current_tag.time - start_time, env->current_tag.microstep); + // Need to update the last_known_status_tag of the port because otherwise, + // the MLAA could get stuck, causing the program to lock up. + // This should not call update_last_known_status_on_input_port because we + // are starting a new tag step execution, so there are no reactions blocked on this input. + if (lf_tag_compare(env->current_tag, event->trigger->last_known_status_tag) > 0) { + event->trigger->last_known_status_tag = env->current_tag; + } } } #endif @@ -643,8 +665,8 @@ static void _lf_replace_token(event_t* event, lf_token_t* token) { /** * Schedule events at a specific tag (time, microstep), provided - * that the tag is in the future relative to the current tag. - * The input time values are absolute. + * that the tag is in the future relative to the current tag (or the + * environment has not started executing). The input time values are absolute. * * If there is an event found at the requested tag, the payload * is replaced and 0 is returned. @@ -665,18 +687,19 @@ static void _lf_replace_token(event_t* event, lf_token_t* token) { * @param tag Logical tag of the event * @param token The token wrapping the payload or NULL for no payload. * - * @return 1 for success, 0 if no new event was scheduled (instead, the payload was updated), - * or -1 for error (the tag is equal to or less than the current tag). 
+ * @return A positive trigger handle for success, 0 if no new event was scheduled + * (instead, the payload was updated), or -1 for error (the tag is equal to or less + * than the current tag). */ -int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token) { +trigger_handle_t _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token) { assert(env != GLOBAL_ENVIRONMENT); tag_t current_logical_tag = env->current_tag; LF_PRINT_DEBUG("_lf_schedule_at_tag() called with tag " PRINTF_TAG " at tag " PRINTF_TAG ".", tag.time - start_time, tag.microstep, current_logical_tag.time - start_time, current_logical_tag.microstep); - if (lf_tag_compare(tag, current_logical_tag) <= 0) { - lf_print_warning("_lf_schedule_at_tag(): requested to schedule an event in the past."); + if (lf_tag_compare(tag, current_logical_tag) <= 0 && env->execution_started) { + lf_print_warning("_lf_schedule_at_tag(): requested to schedule an event at the current or past tag."); return -1; } @@ -827,10 +850,11 @@ int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_to if (tag.time == current_logical_tag.time) { relative_microstep -= current_logical_tag.microstep; } - if (((tag.time == current_logical_tag.time) && (relative_microstep == 1)) || + if ((tag.time == current_logical_tag.time && relative_microstep == 1 && env->execution_started) || tag.microstep == 0) { // Do not need a dummy event if we are scheduling at 1 microstep // in the future at current time or at microstep 0 in a future time. + // Note that if execution hasn't started, then we have to insert dummy events. pqueue_insert(env->event_q, e); } else { // Create a dummy event. 
Insert it into the queue, and let its next @@ -838,7 +862,11 @@ int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_to pqueue_insert(env->event_q, _lf_create_dummy_events(env, trigger, tag.time, e, relative_microstep)); } } - return 1; + trigger_handle_t return_value = env->_lf_handle++; + if (env->_lf_handle < 0) { + env->_lf_handle = 1; + } + return return_value; } /** @@ -918,7 +946,8 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t if (!trigger->is_timer) { delay += trigger->offset; } - interval_t intended_time = env->current_tag.time + delay; + tag_t intended_tag = (tag_t){.time = env->current_tag.time + delay, .microstep = 0}; + LF_PRINT_DEBUG("_lf_schedule: env->current_tag.time = " PRINTF_TIME ". Total logical delay = " PRINTF_TIME "", env->current_tag.time, delay); interval_t min_spacing = trigger->period; @@ -940,7 +969,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // modify the intended time. if (trigger->is_physical) { // Get the current physical time and assign it as the intended time. - intended_time = lf_time_physical() + delay; + intended_tag.time = lf_time_physical() + delay; } else { // FIXME: We need to verify that we are executing within a reaction? // See reactor_threaded. @@ -951,11 +980,11 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // - we have eliminated the possibility to have a negative additional delay; and // - we detect the asynchronous use of logical actions #ifndef NDEBUG - if (intended_time < env->current_tag.time) { + if (intended_tag.time < env->current_tag.time) { lf_print_warning("Attempting to schedule an event earlier than current time by " PRINTF_TIME " nsec! 
" "Revising to the current time " PRINTF_TIME ".", - env->current_tag.time - intended_time, env->current_tag.time); - intended_time = env->current_tag.time; + env->current_tag.time - intended_tag.time, env->current_tag.time); + intended_tag.time = env->current_tag.time; } #endif } @@ -969,7 +998,6 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // Check for conflicts (a queued event with the same trigger and time). if (min_spacing <= 0) { // No minimum spacing defined. - tag_t intended_tag = (tag_t) {.time = intended_time, .microstep = 0u}; e->time = intended_tag.time; event_t* found = (event_t *)pqueue_find_equal_same_priority(env->event_q, e); // Check for conflicts. Let events pile up in super dense time. @@ -989,23 +1017,23 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t } // Hook the event into the list. found->next = e; - trigger->last_time = intended_tag.time; + trigger->last_tag = intended_tag; return(0); // FIXME: return value } // If there are not conflicts, schedule as usual. If intended time is // equal to the current logical time, the event will effectively be // scheduled at the next microstep. - } else if (!trigger->is_timer && trigger->last_time != NEVER) { + } else if (!trigger->is_timer && trigger->last_tag.time != NEVER) { // There is a min_spacing and there exists a previously // scheduled event. It determines the // earliest time at which the new event can be scheduled. // Check to see whether the event is too early. - instant_t earliest_time = trigger->last_time + min_spacing; + instant_t earliest_time = trigger->last_tag.time + min_spacing; LF_PRINT_DEBUG("There is a previously scheduled event; earliest possible time " "with min spacing: " PRINTF_TIME, earliest_time); // If the event is early, see which policy applies. 
- if (earliest_time > intended_time) { + if (earliest_time > intended_tag.time) { LF_PRINT_DEBUG("Event is early."); switch(trigger->policy) { case drop: @@ -1023,7 +1051,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t event_t* dummy = _lf_get_new_event(env); dummy->next = NULL; dummy->trigger = trigger; - dummy->time = trigger->last_time; + dummy->time = trigger->last_tag.time; event_t* found = (event_t *)pqueue_find_equal_same_priority(env->event_q, dummy); if (found != NULL) { @@ -1032,18 +1060,18 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t _lf_replace_token(found, token); _lf_recycle_event(env, e); _lf_recycle_event(env, dummy); - // Leave the last_time the same. + // Leave the last_tag the same. return(0); } _lf_recycle_event(env, dummy); // If the preceding event _has_ been handled, then adjust // the tag to defer the event. - intended_time = earliest_time; + intended_tag = (tag_t){.time = earliest_time, .microstep = 0}; break; default: // Default policy is defer - intended_time = earliest_time; + intended_tag = (tag_t){.time = earliest_time, .microstep = 0}; break; } } @@ -1054,16 +1082,16 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // FIXME: This is a development assertion and might // not be necessary for end-user LF programs #ifndef NDEBUG - if (intended_time < env->current_tag.time) { + if (intended_tag.time < env->current_tag.time) { lf_print_error("Attempting to schedule an event earlier than current time by " PRINTF_TIME " nsec! " "Revising to the current time " PRINTF_TIME ".", - env->current_tag.time - intended_time, env->current_tag.time); - intended_time = env->current_tag.time; + env->current_tag.time - intended_tag.time, env->current_tag.time); + intended_tag.time = env->current_tag.time; } #endif // Set the tag of the event. 
- e->time = intended_time; + e->time = intended_tag.time; // Do not schedule events if if the event time is past the stop time // (current microsteps are checked earlier). @@ -1077,7 +1105,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // Store the time in order to check the min spacing // between this and any following event. - trigger->last_time = intended_time; + trigger->last_tag = intended_tag; // Queue the event. // NOTE: There is no need for an explicit microstep because @@ -1096,7 +1124,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // NOTE: Rather than wrapping around to get a negative number, // we reset the handle on the assumption that much earlier // handles are irrelevant. - int return_value = env->_lf_handle++; + trigger_handle_t return_value = env->_lf_handle++; if (env->_lf_handle < 0) { env->_lf_handle = 1; } @@ -1367,7 +1395,7 @@ void schedule_output_reactions(environment_t *env, reaction_t* reaction, int wor #ifdef FEDERATED_DECENTRALIZED // Only pass down STP violation for federated programs that use decentralized coordination. 
// Extract the inherited STP violation bool inherited_STP_violation = reaction->is_STP_violated; - LF_PRINT_LOG("Reaction %s has STP violation status: %d.", reaction->name, reaction->is_STP_violated); + LF_PRINT_DEBUG("Reaction %s has STP violation status: %d.", reaction->name, reaction->is_STP_violated); #endif LF_PRINT_DEBUG("There are %zu outputs from reaction %s.", reaction->num_outputs, reaction->name); for (size_t i=0; i < reaction->num_outputs; i++) { @@ -1525,6 +1553,8 @@ void usage(int argc, const char* argv[]) { #ifdef FEDERATED printf(" -r, --rti \n"); printf(" The address of the RTI, which can be in the form of user@host:port or ip:port.\n\n"); + printf(" -l\n"); + printf(" Send stdout to individual log files for each federate.\n\n"); #endif printf("Command given:\n"); @@ -1648,7 +1678,7 @@ int process_args(int argc, const char* argv[]) { return 0; } const char* fid = argv[i++]; - set_federation_id(fid); + lf_set_federation_id(fid); lf_print("Federation ID for executable %s: %s", argv[0], fid); } else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--rti") == 0) { if (argc < i + 1) { @@ -1656,7 +1686,7 @@ int process_args(int argc, const char* argv[]) { usage(argc, argv); return 0; } - parse_rti_code_t code = parse_rti_addr(argv[i++]); + parse_rti_code_t code = lf_parse_rti_addr(argv[i++]); if (code != SUCCESS) { switch (code) { case INVALID_HOST: @@ -1707,87 +1737,110 @@ void initialize_global(void) { // Federation trace object must be set before `initialize_trigger_objects` is called because it // uses tracing functionality depending on that pointer being set. #ifdef FEDERATED - set_federation_trace_object(envs->trace); + lf_set_federation_trace_object(envs->trace); #endif // Call the code-generated function to initialize all actions, timers, and ports // This is done for all environments/enclaves at the same time. 
_lf_initialize_trigger_objects() ; } +/** + * Flag to prevent termination function from executing twice and to signal to background + * threads to terminate. + */ +bool _lf_termination_executed = false; + +/** Flag used to disable cleanup operations on abnormal termination. */ +bool _lf_normal_termination = false; + /** * Report elapsed logical and physical times and report if any * memory allocated by set_new, set_new_array, or lf_writable_copy * has not been freed. */ void termination(void) { + if (_lf_termination_executed) return; + _lf_termination_executed = true; + environment_t *env; int num_envs = _lf_get_environments(&env); // Invoke the code generated termination function. It terminates the federated related services. - // It should only be called for the top-level environment, which, after convention, is the first environment. + // It should only be called for the top-level environment, which, by convention, is the first environment. terminate_execution(env); - // In order to free tokens, we perform the same actions we would have for a new time step. - for (int i = 0; iid); - if (!env->initialized) { - lf_print_warning("---- Environment %u was never initialized", env->id); + for (int i = 0; i < num_envs; i++) { + if (!env[i].initialized) { + lf_print_warning("---- Environment %u was never initialized", env[i].id); continue; } + LF_PRINT_LOG("---- Terminating environment %u, normal termination: %d", env[i].id, _lf_normal_termination); // Stop any tracing, if it is running. - stop_trace_locked(env->trace); + // No need to acquire a mutex because if this is normal termination, all + // other threads have stopped, and if it's not, then acquiring a mutex could + // lead to a deadlock. + stop_trace_locked(env[i].trace); - _lf_start_time_step(env); + // Skip most cleanup on abnormal termination. + if (_lf_normal_termination) { + _lf_start_time_step(&env[i]); #ifdef MODAL_REACTORS - // Free events and tokens suspended by modal reactors. 
- _lf_terminate_modal_reactors(env); + // Free events and tokens suspended by modal reactors. + _lf_terminate_modal_reactors(&env[i]); #endif - - // If the event queue still has events on it, report that. - if (env->event_q != NULL && pqueue_size(env->event_q) > 0) { - lf_print_warning("---- There are %zu unprocessed future events on the event queue.", pqueue_size(env->event_q)); - event_t* event = (event_t*)pqueue_peek(env->event_q); - interval_t event_time = event->time - start_time; - lf_print_warning("---- The first future event has timestamp " PRINTF_TIME " after start time.", event_time); - } - // Print elapsed times. - // If these are negative, then the program failed to start up. - interval_t elapsed_time = lf_time_logical_elapsed(env); - if (elapsed_time >= 0LL) { - char time_buffer[29]; // 28 bytes is enough for the largest 64 bit number: 9,223,372,036,854,775,807 - lf_comma_separated_time(time_buffer, elapsed_time); - printf("---- Elapsed logical time (in nsec): %s\n", time_buffer); - - // If start_time is 0, then execution didn't get far enough along - // to initialize this. - if (start_time > 0LL) { - lf_comma_separated_time(time_buffer, lf_time_physical_elapsed()); - printf("---- Elapsed physical time (in nsec): %s\n", time_buffer); + // If the event queue still has events on it, report that. + if (env[i].event_q != NULL && pqueue_size(env[i].event_q) > 0) { + lf_print_warning("---- There are %zu unprocessed future events on the event queue.", pqueue_size(env[i].event_q)); + event_t* event = (event_t*)pqueue_peek(env[i].event_q); + interval_t event_time = event->time - start_time; + lf_print_warning("---- The first future event has timestamp " PRINTF_TIME " after start time.", event_time); + } + // Print elapsed times. + // If these are negative, then the program failed to start up. 
+ interval_t elapsed_time = lf_time_logical_elapsed(&env[i]); + if (elapsed_time >= 0LL) { + char time_buffer[29]; // 28 bytes is enough for the largest 64 bit number: 9,223,372,036,854,775,807 + lf_comma_separated_time(time_buffer, elapsed_time); + printf("---- Elapsed logical time (in nsec): %s\n", time_buffer); + + // If start_time is 0, then execution didn't get far enough along + // to initialize this. + if (start_time > 0LL) { + lf_comma_separated_time(time_buffer, lf_time_physical_elapsed()); + printf("---- Elapsed physical time (in nsec): %s\n", time_buffer); + } } } - - // Free up memory associated with environment - environment_free(env); - - env++; - } - _lf_free_all_tokens(); // Must be done before freeing reactors. - // Issue a warning if a memory leak has been detected. - if (_lf_count_payload_allocations > 0) { - lf_print_warning("Memory allocated for messages has not been freed."); - lf_print_warning("Number of unfreed messages: %d.", _lf_count_payload_allocations); - } - if (_lf_count_token_allocations > 0) { - lf_print_warning("Memory allocated for tokens has not been freed!"); - lf_print_warning("Number of unfreed tokens: %d.", _lf_count_token_allocations); } + // Skip most cleanup on abnormal termination. + if (_lf_normal_termination) { + _lf_free_all_tokens(); // Must be done before freeing reactors. + // Issue a warning if a memory leak has been detected. 
+ if (_lf_count_payload_allocations > 0) { + lf_print_warning("Memory allocated for messages has not been freed."); + lf_print_warning("Number of unfreed messages: %d.", _lf_count_payload_allocations); + } + if (_lf_count_token_allocations > 0) { + lf_print_warning("Memory allocated for tokens has not been freed!"); + lf_print_warning("Number of unfreed tokens: %d.", _lf_count_token_allocations); + } #if !defined(LF_SINGLE_THREADED) - for (int i = 0; i < _lf_watchdog_count; i++) { - if (_lf_watchdogs[i].base->reactor_mutex != NULL) { - free(_lf_watchdogs[i].base->reactor_mutex); + for (int i = 0; i < _lf_watchdog_count; i++) { + if (_lf_watchdogs[i].base->reactor_mutex != NULL) { + free(_lf_watchdogs[i].base->reactor_mutex); + } } - } #endif - _lf_free_all_reactors(); + _lf_free_all_reactors(); + + // Free up memory associated with environment. + // Do this last so that printed warnings don't access freed memory. + for (int i = 0; i < num_envs; i++) { + environment_free(&env[i]); + } +#if defined LF_ENCLAVES + free_local_rti(); +#endif + } } diff --git a/core/tag.c b/core/tag.c index eee10699f..c632476c0 100644 --- a/core/tag.c +++ b/core/tag.c @@ -99,13 +99,14 @@ instant_t _lf_physical_time() { _lf_last_reported_physical_time_ns = adjusted_clock_ns; } + /* Possibly useful, but usually noisy: LF_PRINT_DEBUG("Physical time: " PRINTF_TIME ". Elapsed: " PRINTF_TIME ". Offset: " PRINTF_TIME, _lf_last_reported_physical_time_ns, _lf_last_reported_physical_time_ns - start_time, _lf_time_physical_clock_offset + _lf_time_test_physical_clock_offset); - + */ return _lf_last_reported_physical_time_ns; } @@ -119,6 +120,7 @@ tag_t lf_tag(void *env) { tag_t lf_tag_add(tag_t a, tag_t b) { if (a.time == NEVER || b.time == NEVER) return NEVER_TAG; if (a.time == FOREVER || b.time == FOREVER) return FOREVER_TAG; + if (b.time > 0) a.microstep = 0; // Ignore microstep of first arg if time of second is > 0. 
tag_t result = {.time = a.time + b.time, .microstep = a.microstep + b.microstep}; if (result.microstep < a.microstep) return FOREVER_TAG; if (result.time < a.time && b.time > 0) return FOREVER_TAG; diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index a462dd77c..b24d70390 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -65,16 +65,6 @@ extern instant_t start_time; */ #define MAX_STALL_INTERVAL MSEC(1) -/** - * Unless the "fast" option is given, an LF program will wait until - * physical time matches logical time before handling an event with - * a given logical time. The amount of time is less than this given - * threshold, then no wait will occur. The purpose of this is - * to prevent unnecessary delays caused by simply setting up and - * performing the wait. - */ -#define MIN_SLEEP_DURATION USEC(10) - /** * Global mutex, used for synchronizing across environments. Mainly used for token-management and tracing */ @@ -241,28 +231,26 @@ void _lf_set_present(lf_port_base_t* port) { } } -// Forward declaration. See federate.h -void synchronize_with_other_federates(void); - /** * Wait until physical time matches or exceeds the specified logical time, - * unless -fast is given. + * unless -fast is given. For decentralized coordination, this function will + * add the STA offset to the wait time. * * If an event is put on the event queue during the wait, then the wait is * interrupted and this function returns false. It also returns false if the - * timeout time is reached before the wait has completed. + * timeout time is reached before the wait has completed. Note that this could + * return true even if a new event was placed on the queue if that event + * time matches or exceeds the specified time. * - * The mutex lock is assumed to be held by the calling thread.
- * Note this this could return true even if the a new event - * was placed on the queue if that event time matches or exceeds - * the specified time. + * The mutex lock associated with the condition argument is assumed to be held by + * the calling thread. This mutex is released while waiting. If the wait time is + * too small to actually wait (less than MIN_SLEEP_DURATION), then this function + * immediately returns true and the mutex is not released. * * @param env Environment within which we are executing. * @param logical_time Logical time to wait until physical time matches it. - * @param return_if_interrupted If this is false, then wait_util will wait - * until physical time matches the logical time regardless of whether new - * events get put on the event queue. This is useful, for example, for - * synchronizing the start of the program. + * @param condition A condition variable that can interrupt the wait. The mutex + * associated with this condition variable will be released during the wait. * * @return Return false if the wait is interrupted either because of an event * queue signal or if the wait time was interrupted early by reaching @@ -393,11 +381,6 @@ tag_t get_next_event_tag(environment_t *env) { return next_tag; } -#ifdef FEDERATED_CENTRALIZED -// The following is defined in federate.c and used in the following function. 
-tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply); -#endif - /** * In a federated execution with centralized coordination, this function returns * a tag that is less than or equal to the specified tag when, as far @@ -414,7 +397,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply */ tag_t send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply) { #if defined(FEDERATED_CENTRALIZED) - return _lf_send_next_event_tag(env, tag, wait_for_reply); + return lf_send_next_event_tag(env, tag, wait_for_reply); #elif defined(LF_ENCLAVES) return rti_next_event_tag_locked(env->enclave_info, tag); #else @@ -581,10 +564,10 @@ void _lf_next_locked(environment_t *env) { // stick them into the reaction queue. _lf_pop_events(env); #ifdef FEDERATED - enqueue_port_absent_reactions(env); + lf_enqueue_port_absent_reactions(env); // _lf_pop_events may have set some triggers present. extern federate_instance_t _fed; - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); #endif } @@ -596,9 +579,11 @@ bool lf_stop_requested = false; // See reactor.h for docs. void lf_request_stop() { // If a requested stop is pending, return without doing anything. + LF_PRINT_LOG("lf_request_stop() has been called."); lf_mutex_lock(&global_mutex); if (lf_stop_requested) { lf_mutex_unlock(&global_mutex); + LF_PRINT_LOG("Ignoring redundant lf_request_stop() call."); return; } lf_stop_requested = true; @@ -620,10 +605,10 @@ void lf_request_stop() { } #ifdef FEDERATED - // In the federated case, do not set lf_stop_requested because the RTI might grant a + // In the federated case, the RTI might grant a // later stop tag than the current tag. The above code has raised - // a barrier no greater than the requested stop tag for each enclave. - if (_lf_fd_send_stop_request_to_rti(max_current_tag) != 0) { + // a barrier no greater than max_current_tag. 
+ if (lf_send_stop_request_to_rti(max_current_tag) != 0) { // Message was not sent to the RTI. // Decrement the barriers to reverse our previous increment. for (int i = 0; i < num_environments; i++) { @@ -698,10 +683,10 @@ void _lf_initialize_start_tag(environment_t *env) { if (env == top_level_env) { // Reset status fields before talking to the RTI to set network port // statuses to unknown - reset_status_fields_on_input_port_triggers(); + lf_reset_status_fields_on_input_port_triggers(); // Get a start_time from the RTI - synchronize_with_other_federates(); // Resets start_time in federated execution according to the RTI. + lf_synchronize_with_other_federates(); // Resets start_time in federated execution according to the RTI. } // The start time will likely have changed. Adjust the current tag and stop tag. @@ -719,7 +704,7 @@ void _lf_initialize_start_tag(environment_t *env) { env->current_tag = (tag_t){.time = start_time - _lf_fed_STA_offset, .microstep = 0u}; // Call wait_until if federated. This is required because the startup procedure - // in synchronize_with_other_federates() can decide on a new start_time that is + // in lf_synchronize_with_other_federates() can decide on a new start_time that is // larger than the current physical time. // Therefore, if --fast was not specified, wait until physical time matches // or exceeds the start time. Microstep is ignored. @@ -745,10 +730,6 @@ void _lf_initialize_start_tag(environment_t *env) { // Restore the current tag to match the start time. env->current_tag = (tag_t){.time = start_time, .microstep = 0u}; - // For messages that may have arrived while we were waiting, put - // reactions on the reaction queue. - _lf_pop_events(env); - // If the stop_tag is (0,0), also insert the shutdown // reactions. This can only happen if the timeout time // was set to 0. @@ -765,13 +746,18 @@ void _lf_initialize_start_tag(environment_t *env) { // once the complete message has been read. 
Here, we wait for that barrier // to be removed, if appropriate before proceeding to executing tag (0,0). _lf_wait_on_tag_barrier(env, (tag_t){.time=start_time,.microstep=0}); - spawn_staa_thread(); + lf_spawn_staa_thread(); #else // NOT FEDERATED_DECENTRALIZED // Each federate executes the start tag (which is the current // tag). Inform the RTI of this if needed. send_next_event_tag(env, env->current_tag, true); #endif // NOT FEDERATED_DECENTRALIZED + + // For messages that may have arrived while we were waiting, put + // reactions on the reaction queue. + _lf_pop_events(env); + #else // NOT FEDERATED _lf_initialize_timers(env); @@ -785,7 +771,7 @@ void _lf_initialize_start_tag(environment_t *env) { // Set the following boolean so that other thread(s), including federated threads, // know that the execution has started - _lf_execution_started = true; + env->execution_started = true; } /** For logging and debugging, each worker thread is numbered. */ @@ -846,7 +832,7 @@ bool _lf_worker_handle_deadline_violation_for_reaction(environment_t *env, int w * @param worker_number The ID of the worker. * @param reaction The reaction whose STP offset has been violated. * - * @return true if an STP violation occurred. false otherwise. + * @return true if an STP violation occurred and was handled. false otherwise. */ bool _lf_worker_handle_STP_violation_for_reaction(environment_t* env, int worker_number, reaction_t* reaction) { bool violation_occurred = false; @@ -877,6 +863,10 @@ bool _lf_worker_handle_STP_violation_for_reaction(environment_t* env, int worker violation_occurred = true; (*handler)(reaction->self); + // Reset the STP violation flag because it has been dealt with. + // Downstream handlers should not be invoked. + reaction->is_STP_violated = false; + // If the reaction produced outputs, put the resulting // triggered reactions into the queue or execute them directly if possible. 
schedule_output_reactions(env, reaction, worker_number); @@ -907,7 +897,7 @@ bool _lf_worker_handle_STP_violation_for_reaction(environment_t* env, int worker * @param worker_number The ID of the worker. * @param reaction The reaction. * - * @return true if a violation occurred. false otherwise. + * @return true if a violation occurred and was handled. false otherwise. */ bool _lf_worker_handle_violations(environment_t *env, int worker_number, reaction_t* reaction) { bool violation = false; @@ -944,7 +934,7 @@ void _lf_worker_invoke_reaction(environment_t *env, int worker_number, reaction_ void try_advance_level(environment_t* env, volatile size_t* next_reaction_level) { #ifdef FEDERATED - stall_advance_level_federation(env, *next_reaction_level); + lf_stall_advance_level_federation(env, *next_reaction_level); #endif if (*next_reaction_level < SIZE_MAX) *next_reaction_level += 1; } @@ -966,7 +956,7 @@ void _lf_worker_do_work(environment_t *env, int worker_number) { // lf_print_snapshot(); // This is quite verbose (but very useful in debugging reaction deadlocks). reaction_t* current_reaction_to_execute = NULL; #ifdef FEDERATED - stall_advance_level_federation(env, 0); + lf_stall_advance_level_federation(env, 0); #endif while ((current_reaction_to_execute = lf_sched_get_ready_reaction(env->scheduler, worker_number)) @@ -1151,6 +1141,10 @@ int lf_reactor_c_main(int argc, const char* argv[]) { // Ignore SIGPIPE errors, which terminate the entire application if // socket write() fails because the reader has closed the socket. // Instead, cause an EPIPE error to be set when write() fails. + // NOTE: The reason for a broken socket causing a SIGPIPE signal + // instead of just having write() return an error is to robustly + // handle a foo | bar pipeline where bar crashes. The default behavior + // is for foo to also exit.
signal(SIGPIPE, SIG_IGN); #endif // SIGPIPE @@ -1239,9 +1233,7 @@ int lf_reactor_c_main(int argc, const char* argv[]) { LF_PRINT_LOG("---- All worker threads exited successfully."); } } -#if defined LF_ENCLAVES - free_local_rti(); -#endif + _lf_normal_termination = true; return 0; } diff --git a/core/threaded/scheduler_NP.c b/core/threaded/scheduler_NP.c index 9e032017b..aa3549e0d 100644 --- a/core/threaded/scheduler_NP.c +++ b/core/threaded/scheduler_NP.c @@ -128,7 +128,7 @@ int _lf_sched_distribute_ready_reactions(lf_scheduler_t* scheduler) { scheduler->next_reaction_level - 1 ]; - LF_PRINT_DEBUG("DEBUG: start of rxn queue at %lu is %p", scheduler->next_reaction_level - 1, ((reaction_t**)scheduler->executing_reactions)[0]); + LF_PRINT_DEBUG("Start of rxn queue at %lu is %p", scheduler->next_reaction_level - 1, ((reaction_t**)scheduler->executing_reactions)[0]); if (((reaction_t**)scheduler->executing_reactions)[0] != NULL) { // There is at least one reaction to execute return 1; diff --git a/core/threaded/scheduler_sync_tag_advance.c b/core/threaded/scheduler_sync_tag_advance.c index 017dda77d..28d3fa458 100644 --- a/core/threaded/scheduler_sync_tag_advance.c +++ b/core/threaded/scheduler_sync_tag_advance.c @@ -52,7 +52,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * @brief Indicator that execution of at least one tag has completed. */ -static bool _lf_logical_tag_completed = false; +static bool _latest_tag_completed = false; /** * Return true if the worker should stop now; false otherwise. @@ -60,7 +60,7 @@ static bool _lf_logical_tag_completed = false; */ bool should_stop_locked(lf_scheduler_t * sched) { // If this is not the very first step, check against the stop tag to see whether this is the last step. - if (_lf_logical_tag_completed) { + if (_latest_tag_completed) { // If we are at the stop tag, do not call _lf_next_locked() // to prevent advancing the logical time. 
if (lf_tag_compare(sched->env->current_tag, sched->env->stop_tag) >= 0) { @@ -92,7 +92,7 @@ bool _lf_sched_advance_tag_locked(lf_scheduler_t * sched) { return true; } - _lf_logical_tag_completed = true; + _latest_tag_completed = true; // Advance time. // _lf_next_locked() may block waiting for real time to pass or events to appear. diff --git a/core/trace.c b/core/trace.c index 6fffbc7bf..34b7cd5d2 100644 --- a/core/trace.c +++ b/core/trace.c @@ -53,7 +53,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fprintf(stderr, "WARNING: Access to trace file failed.\n"); \ fclose(trace->_lf_trace_file); \ trace->_lf_trace_file = NULL; \ - lf_critical_section_exit(trace->env); \ return -1; \ } while(0) @@ -196,7 +195,10 @@ void flush_trace_locked(trace_t* trace, int worker) { // This is deferred to here so that user trace objects can be // registered in startup reactions. if (!trace->_lf_trace_header_written) { - write_trace_header(trace); + if (write_trace_header(trace) < 0) { + lf_print_error("Failed to write trace header. 
Trace file will be incomplete."); + return; + } trace->_lf_trace_header_written = true; } @@ -482,8 +484,10 @@ void stop_trace_locked(trace_t* trace) { flush_trace_locked(trace, 0); } trace->_lf_trace_stop = 1; - fclose(trace->_lf_trace_file); - trace->_lf_trace_file = NULL; + if (trace->_lf_trace_file != NULL) { + fclose(trace->_lf_trace_file); + trace->_lf_trace_file = NULL; + } LF_PRINT_DEBUG("Stopped tracing."); } diff --git a/core/utils/pqueue_tag.c b/core/utils/pqueue_tag.c index 579926f99..2d05af7bc 100644 --- a/core/utils/pqueue_tag.c +++ b/core/utils/pqueue_tag.c @@ -132,6 +132,16 @@ int pqueue_tag_insert_if_no_match(pqueue_tag_t* q, tag_t t) { } } +pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q) { + return (pqueue_tag_element_t*) pqueue_peek((pqueue_t*)q); +} + +tag_t pqueue_tag_peek_tag(pqueue_tag_t* q) { + pqueue_tag_element_t* element = (pqueue_tag_element_t*)pqueue_tag_peek(q); + if (element == NULL) return FOREVER_TAG; + else return element->tag; +} + pqueue_tag_element_t* pqueue_tag_pop(pqueue_tag_t* q) { return (pqueue_tag_element_t*)pqueue_pop((pqueue_t*)q); } @@ -146,10 +156,14 @@ tag_t pqueue_tag_pop_tag(pqueue_tag_t* q) { } } -int pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e) { - return pqueue_remove((pqueue_t*) q, (void*) e); +void pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e) { + pqueue_remove((pqueue_t*) q, (void*) e); } -pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q) { - return (pqueue_tag_element_t*) pqueue_peek((pqueue_t*)q); -} +void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t){ + tag_t head = pqueue_tag_peek_tag(q); + while (lf_tag_compare(head, FOREVER_TAG) < 0 && lf_tag_compare(head, t) <= 0) { + pqueue_tag_pop(q); + head = pqueue_tag_peek_tag(q); + } +} \ No newline at end of file diff --git a/core/utils/util.c b/core/utils/util.c index 23daef364..f03403eaf 100644 --- a/core/utils/util.c +++ b/core/utils/util.c @@ -79,6 +79,13 @@ void _lf_message_print( int is_error, const char* prefix, 
const char* format, va_list args, int log_level ) ATTRIBUTE_FORMAT_PRINTF(3, 0); +/** + * Print a fatal error message. Internal function. + */ +static void lf_vprint_fatal_error(const char* format, va_list args) { + _lf_message_print(1, "FATAL ERROR: ", format, args, LOG_LEVEL_ERROR); +} + /** * Internal implementation of the next few reporting functions. */ @@ -134,11 +141,8 @@ void _lf_message_print( #endif // STANDALONE_RTI } if (print_message_function == NULL) { - if (is_error) { - vfprintf(stderr, message, args); - } else { - vfprintf(stdout, message, args); - } + // NOTE: Send all messages to stdout, not to stderr, so that ordering makes sense. + vfprintf(stdout, message, args); } else { (*print_message_function)(message, args); } @@ -204,13 +208,19 @@ void lf_vprint_warning(const char* format, va_list args) { void lf_print_error_and_exit(const char* format, ...) { va_list args; va_start (args, format); - lf_vprint_error_and_exit(format, args); + lf_vprint_fatal_error(format, args); va_end (args); + fflush(stdout); exit(EXIT_FAILURE); } -void lf_vprint_error_and_exit(const char* format, va_list args) { - _lf_message_print(1, "FATAL ERROR: ", format, args, LOG_LEVEL_ERROR); +void lf_print_error_system_failure(const char* format, ...) { + va_list args; + va_start (args, format); + lf_vprint_error(format, args); + va_end (args); + lf_print_error_and_exit("Error %d: %s", errno, strerror(errno)); + exit(EXIT_FAILURE); } void lf_register_print_function(print_message_function_t* function, int log_level) { diff --git a/include/core/environment.h b/include/core/environment.h index d4852ddca..8670b8213 100644 --- a/include/core/environment.h +++ b/include/core/environment.h @@ -67,6 +67,7 @@ typedef struct enclave_info_t enclave_info_t; */ typedef struct environment_t { bool initialized; + bool execution_started; // Events at the start tag have been pulled from the event queue. 
char *name; int id; tag_t current_tag; diff --git a/include/core/federated/clock-sync.h b/include/core/federated/clock-sync.h index eb3e4c341..0106afc54 100644 --- a/include/core/federated/clock-sync.h +++ b/include/core/federated/clock-sync.h @@ -149,9 +149,9 @@ uint16_t setup_clock_synchronization_with_rti(void); * Failing to complete this protocol is treated as a catastrophic * error that causes the federate to exit. * - * @param rti_socket_TCP The rti's socket + * @param rti_socket_TCP Pointer to the RTI's socket */ -void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP); +void synchronize_initial_physical_clock_with_rti(int* rti_socket_TCP); /** * Handle a clock synchroninzation message T1 coming from the RTI. diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 880408ec6..e035d94c0 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -1,32 +1,12 @@ /** * @file - * @author Edward A. Lee (eal@berkeley.edu) - * - * @section LICENSE -Copyright (c) 2020, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - * @section DESCRIPTION - * Data structures and functions used and defined in federate.c. + * @author Soroush Bateni + * @author Peter Donovan + * @author Edward A. Lee + * @author Anirudh Rengarajsm + * @copyright (c) 2020-2023, The University of California at Berkeley. + * License: BSD 2-clause + * @brief Data structures and functions used and defined in federate.c. */ #ifndef FEDERATE_H @@ -43,13 +23,16 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ADVANCE_MESSAGE_INTERVAL MSEC(10) #endif +////////////////////////////////////////////////////////////////////////////////// +// Data types + /** * Structure that a federate instance uses to keep track of its own state. */ typedef struct federate_instance_t { /** * The TCP socket descriptor for this federate to communicate with the RTI. - * This is set by connect_to_rti(), which must be called before other + * This is set by lf_connect_to_rti(), which must be called before other * functions that communicate with the rti are called. */ int socket_TCP_RTI; @@ -59,14 +42,6 @@ typedef struct federate_instance_t { */ lf_thread_t RTI_socket_listener; - /** - * Thread responsible for setting ports to absent by an STAA offset if they - * aren't already known. - */ - #ifdef FEDERATED_DECENTRALIZED - lf_thread_t staaSetter; - #endif - /** * Number of inbound physical connections to the federate. 
* This can be either physical connections, or logical connections @@ -92,7 +67,7 @@ typedef struct federate_instance_t { * An array that holds the socket descriptors for inbound * connections from each federate. The index will be the federate * ID of the remote sending federate. This is initialized at startup - * to -1 and is set to a socket ID by handle_p2p_connections_from_federates() + * to -1 and is set to a socket ID by lf_handle_p2p_connections_from_federates() * when the socket is opened. * * @note There will not be an inbound socket unless a physical connection @@ -107,7 +82,7 @@ typedef struct federate_instance_t { * An array that holds the socket descriptors for outbound direct * connections to each remote federate. The index will be the federate * ID of the remote receiving federate. This is initialized at startup - * to -1 and is set to a socket ID by connect_to_federate() + * to -1 and is set to a socket ID by lf_connect_to_federate() * when the socket is opened. * * @note This federate will not open an outbound socket unless a physical @@ -126,7 +101,7 @@ typedef struct federate_instance_t { /** * A socket descriptor for the socket server of the federate. - * This is assigned in create_server(). + * This is assigned in lf_create_server(). * This socket is used to listen to incoming physical connections from * remote federates. Once an incoming connection is accepted, the * opened socket will be stored in @@ -135,26 +110,21 @@ typedef struct federate_instance_t { int server_socket; /** - * The port used for the server socket - * to listen for messages from other federates. - * The federate informs the RTI of this port once - * it has created its socket server by sending - * an ADDRESS_AD message (@see rti.h). + * The port used for the server socket to listen for messages from other federates. + * The federate informs the RTI of this port once it has created its socket server by + * sending an ADDRESS_AD message (@see rti.h). 
*/ int server_port; /** - * Most recent TIME_ADVANCE_GRANT received from the RTI, or NEVER if none - * has been received. - * This is used to communicate between the listen_to_rti_TCP thread and the - * main federate thread. - * This variable should only be accessed while holding the mutex lock. + * Most recent tag advance grant (TAG) received from the RTI, or NEVER if none + * has been received. This variable should only be accessed while holding the + * mutex lock on the top-level environment. */ tag_t last_TAG; /** - * Indicates whether the last TAG received is provisional or an ordinary - * TAG. + * Indicates whether the last TAG received is provisional or an ordinary TAG. * If the last TAG has been provisional, network port absent reactions must be inserted. * This variable should only be accessed while holding the mutex lock. */ @@ -180,13 +150,12 @@ typedef struct federate_instance_t { bool received_stop_request_from_rti; /** - * A record of the most recently sent LTC (logical tag complete) message. + * A record of the most recently sent LTC (latest tag complete) message. * In some situations, federates can send logical_tag_complete for * the same tag twice or more in-a-row to the RTI. For example, when * _lf_next() returns without advancing tag. To prevent overwhelming * the RTI with extra messages, record the last sent logical tag - * complete message and check against it in - * _lf_logical_tag_complete(). + * complete message and check against it in lf_latest_tag_complete(). * * @note Here, the underlying assumption is that the TCP stack will * deliver the Logical TAG Complete message to the RTI eventually @@ -207,15 +176,25 @@ typedef struct federate_instance_t { */ instant_t min_delay_from_physical_action_to_federate_output; - // Trace object + /** + * Trace object for this federate, used if tracing is enabled. 
+ */ trace_t* trace; + + #ifdef FEDERATED_DECENTRALIZED + /** + * Thread responsible for setting ports to absent by an STAA offset if they + * aren't already known. + */ + lf_thread_t staaSetter; + #endif } federate_instance_t; #ifdef FEDERATED_DECENTRALIZED -typedef struct staa { +typedef struct staa_t { lf_action_base_t** actions; size_t STAA; - size_t numActions; + size_t num_actions; } staa_t; #endif @@ -226,31 +205,39 @@ typedef struct federation_metadata_t { char* rti_user; } federation_metadata_t; -extern lf_mutex_t outbound_socket_mutex; -extern lf_cond_t port_status_changed; -extern lf_cond_t logical_time_changed; +typedef enum parse_rti_code_t { + SUCCESS, + INVALID_PORT, + INVALID_HOST, + INVALID_USER, + FAILED_TO_PARSE +} parse_rti_code_t; + +////////////////////////////////////////////////////////////////////////////////// +// Global variables /** -* Generated function that sends information about connections between this federate and -* other federates where messages are routed through the RTI. Currently, this -* only includes logical connections when the coordination is centralized. This -* information is needed for the RTI to perform the centralized coordination. -* @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h -*/ -void send_neighbor_structure_to_RTI(int); + * Mutex lock held while performing socket write and close operations. + */ +extern lf_mutex_t lf_outbound_socket_mutex; /** - * @brief Spawns a thread to iterate through STAA structs, setting its associated ports absent - * at an offset if the port is not present with a value by a certain physical time. - * + * Condition variable for blocking on unknown federate input ports.
+ */ -#ifdef FEDERATED_DECENTRALIZED -void spawn_staa_thread(void); -#endif +extern lf_cond_t lf_port_status_changed; + +/** + * Condition variable for blocking on tag advance. +*/ +extern lf_cond_t lf_current_tag_changed; + +////////////////////////////////////////////////////////////////////////////////// +// Public functions (in alphabetical order) /** - * Connect to the federate with the specified id. This established - * connection will then be used in functions such as send_timed_message() + * @brief Connect to the federate with the specified id. + * + * The established connection will then be used in functions such as lf_send_tagged_message() * to send messages directly to the specified federate. * This function first sends an MSG_TYPE_ADDRESS_QUERY message to the RTI to obtain * the IP address and port number of the specified federate. It then attempts @@ -260,145 +247,109 @@ void spawn_staa_thread(void); * refer to the socket for communicating directly with the federate. * @param remote_federate_id The ID of the remote federate. */ -void connect_to_federate(uint16_t); - -/** - * Send a logical tag complete (LTC) message to the RTI - * unless an equal or later LTC has previously been sent. - * This function assumes the caller holds the mutex lock. - * - * @param tag_to_send The tag to send. - */ -void _lf_logical_tag_complete(tag_t); +void lf_connect_to_federate(uint16_t); /** - * Connect to the RTI at the specified host and port and return - * the socket descriptor for the connection. If this fails, the - * program exits. If it succeeds, it sets the _fed.socket_TCP_RTI global - * variable to refer to the socket for communicating with the RTI. + * @brief Connect to the RTI at the specified host and port. + * + * This will return the socket descriptor for the connection. + * If port_number is 0, then start at DEFAULT_PORT and increment + * the port number on each attempt. If an attempt fails, wait CONNECT_RETRY_INTERVAL + * and try again.
If it fails after CONNECT_MAX_RETRIES, the program exits. + * If it succeeds, it sets the _fed.socket_TCP_RTI global variable to refer to + * the socket for communicating with the RTI. * @param hostname A hostname, such as "localhost". - * @param port_number A port number. + * @param port_number A port number or 0 to start with the default. */ -void connect_to_rti(const char*, int); +void lf_connect_to_rti(const char* hostname, int port_number); /** - * Thread that listens for inputs from other federates. - * This thread listens for messages of type MSG_TYPE_P2P_MESSAGE, - * MSG_TYPE_P2P_TAGGED_MESSAGE, or MSG_TYPE_PORT_ABSENT (@see net_common.h) from the specified - * peer federate and calls the appropriate handling function for - * each message type. If an error occurs or an EOF is received - * from the peer, then this procedure sets the corresponding - * socket in _fed.sockets_for_inbound_p2p_connections - * to -1 and returns, terminating the thread. - * @param fed_id_ptr A pointer to a uint16_t containing federate ID being listened to. - * This procedure frees the memory pointed to before returning. + * @brief Create a server to listen to incoming P2P connections. + * + * Such connections are used for physical connections or any connection if using + * decentralized coordination. This function only handles the creation of the server socket. + * The bound port for the server socket is then sent to the RTI by sending an + * MSG_TYPE_ADDRESS_ADVERTISEMENT message (@see net_common.h). + * This function expects no response from the RTI. + * + * If a port is specified by the user, that will be used. + * Otherwise, a random port will be assigned. If the bind fails, + * it will retry after PORT_BIND_RETRY_INTERVAL until it has tried + * PORT_BIND_RETRY_LIMIT times. Then it will fail. + * + * @param specified_port The port specified by the user or 0 to use a random port. 
*/ -void* listen_to_federates(void*); +void lf_create_server(int specified_port); /** - * Create a server to listen to incoming physical - * connections from remote federates. This function - * only handles the creation of the server socket. - * The reserved port for the server socket is then - * sent to the RTI by sending an MSG_TYPE_ADDRESS_ADVERTISEMENT message - * (@see net_common.h). This function expects no response - * from the RTI. - * - * If a port is specified by the user, that will be used - * as the only possibility for the server. This function - * will fail if that port is not available. If a port is not - * specified, the STARTING_PORT (@see net_common.h) will be used. - * The function will keep incrementing the port in this case - * until the number of tries reaches PORT_RANGE_LIMIT. - * - * @note This function is similar to create_server(...) in rti.c. - * However, it contains specific log messages for the peer to - * peer connections between federates. It also additionally - * sends an address advertisement (MSG_TYPE_ADDRESS_ADVERTISEMENT) message to the - * RTI informing it of the port. - * - * @param specified_port The specified port by the user. + * @brief Enqueue port absent reactions. + * + * These reactions will send a MSG_TYPE_PORT_ABSENT + * message to downstream federates if a given network output port is not present. + * @param env The environment of the federate */ -void create_server(int specified_port); +void lf_enqueue_port_absent_reactions(environment_t* env); /** - * Thread to accept connections from other federates that send this federate - * messages directly (not through the RTI). This thread starts a thread for - * each accepted socket connection and, once it has opened all expected + * @brief Thread to accept connections from other federates. + * + * This thread accepts connections from federates that send messages directly + * to this one (not through the RTI). 
This thread starts a thread for + * each accepted socket connection to read messages and, once it has opened all expected * sockets, exits. * @param ignored No argument needed for this thread. */ -void* handle_p2p_connections_from_federates(void*); +void* lf_handle_p2p_connections_from_federates(void*); /** - * Send a port absent message to federate with fed_ID, informing the - * remote federate that the current federate will not produce an event - * on this network port at the current logical time. + * @brief Send a latest tag complete (LTC) signal to the RTI. + * + * This avoids the send if an equal or later LTC has previously been sent. + * + * This function assumes the caller holds the mutex lock + * on the top-level environment. * - * @param env The environment in which we are executing - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. - * @param port_ID The ID of the receiving port. - * @param fed_ID The fed ID of the receiving federate. - */ -void send_port_absent_to_federate(environment_t* env, interval_t, unsigned short, unsigned short); - -/** - * Enqueue port absent reactions that will send a PORT_ABSENT - * message to downstream federates if a given network output port is not present. + * @param tag_to_send The tag to send. */ -void enqueue_port_absent_reactions(environment_t* env); +void lf_latest_tag_complete(tag_t); /** - * @brief Wait until inputs statuses are known up to and including the specified level. - * Specifically, wait until the specified level is less that the max level allowed to - * advance (MLAA). - * @param env The environment (which should always be the top-level environment). - * @param level The level to which we would like to advance. + * @brief Parse the address of the RTI and store them into the global federation_metadata struct. 
+ * @return a parse_rti_code_t indicating the result of the parse. */ -void stall_advance_level_federation(environment_t* env, size_t level); +parse_rti_code_t lf_parse_rti_addr(const char* rti_addr); /** - * @brief Update the max level allowed to advance (MLAA). - * If the specified tag is greater than the current_tag of the top-level environment - * (or equal and is_provisional is false), then set the MLAA to MAX_INT and return. - * This removes any barriers on execution at the current tag due to network inputs. - * Otherwise, set the MLAA to the minimum level over all (non-physical) network input ports - * where the status of the input port is not known at that current_tag. + * @brief Reset the status fields on network input ports to unknown or absent. * - * This function assumes that the caller holds the mutex. - * - * @param tag The latest TAG or PTAG received by this federate. - * @param is_provisional Whether the tag was provisional. - * @return True if the MLAA changed. + * This will reset to absent if the last_known_status_tag field of the port + * is greater than or equal to the current tag of the top-level environment. + * This should be overridden to present if an event gets scheduled. + * Otherwise, set the status to unknown. + * @note This function must be called at the beginning of each + * logical time. */ -bool update_max_level(tag_t tag, bool is_provisional); +void lf_reset_status_fields_on_input_port_triggers(); /** - * Send a message to another federate directly or via the RTI. - * This method assumes that the caller does not hold the outbound_socket_mutex lock, + * @brief Send a message to another federate. + * + * This function is used for physical connections + * between federates. If the socket connection to the remote federate or the RTI has been broken, + * then this returns -1 without sending. Otherwise, it returns 0.
+ * + * This method assumes that the caller does not hold the lf_outbound_socket_mutex lock, * which it acquires to perform the send. * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. - * - * @note This function is similar to send_timed_message() except that it - * does not deal with time and timed_messages. - * - * @param message_type The type of the message being sent. - * Currently can be MSG_TYPE_TAGGED_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_TAGGED_MESSAGE for messages sent between - * federates. + * @param message_type The type of the message being sent (currently only MSG_TYPE_P2P_MESSAGE). * @param port The ID of the destination port. * @param federate The ID of the destination federate. - * @param next_destination_str The name of the next destination in string format + * @param next_destination_str The name of the next destination in string format (for reporting). * @param length The message length. * @param message The message. - * @return 1 if the message has been sent, 0 otherwise. + * @return 0 if the message has been sent, -1 otherwise. */ -int send_message(int message_type, +int lf_send_message(int message_type, unsigned short port, unsigned short federate, const char* next_destination_str, @@ -406,70 +357,201 @@ int send_message(int message_type, unsigned char* message); /** - * Send the specified timestamped message to the specified port in the - * specified federate via the RTI or directly to a federate depending on - * the given socket. The timestamp is calculated as current_logical_time + - * additional delay which is greater than or equal to zero. - * The port should be an input port of a reactor in - * the destination federate. This version does include the timestamp - * in the message. The caller can reuse or free the memory after this returns. + * @brief Send information about connections to the RTI. 
+ * + * This is a generated function that sends information about connections between this federate + * and other federates where messages are routed through the RTI. Currently, this + * only includes logical connections when the coordination is centralized. This + * information is needed for the RTI to perform the centralized coordination. + * @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h + */ +void lf_send_neighbor_structure_to_RTI(int); + +/** + * @brief Send a next event tag (NET) signal. + * + * If this federate depends on upstream federates or sends data to downstream + * federates, then send to the RTI a NET, which will give the tag of the + * earliest event on the event queue, or, if the queue is empty, the timeout + * time, or, if there is no timeout, FOREVER. * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. + * If there are network outputs that + * depend on physical actions, then insert a dummy event to ensure this federate + * advances its tag so that downstream federates can make progress. * - * This method assumes that the caller does not hold the outbound_socket_mutex lock, - * which it acquires to perform the send. + * A NET is a promise saying that, absent network inputs, this federate will + * not produce an output message with tag earlier than the NET value. + * + * If there are upstream federates, then after sending a NET, this will block + * until either the RTI grants the advance to the requested time or the wait + * for the response from the RTI is interrupted by a change in the event queue + * (e.g., a physical action triggered or a network message arrived). + * If there are no upstream federates, then it will not wait for a TAG + * (which won't be forthcoming anyway) and returns the earliest tag on the event queue. 
+ * + * If the federate has neither upstream nor downstream federates, then this + * returns the specified tag immediately without sending anything to the RTI. + * + * If there is at least one physical action somewhere in the federate that can + * trigger an output to a downstream federate, then the NET is required to be + * less than the current physical time. If physical time is less than the + * earliest event in the event queue (or the event queue is empty), then this + * function will insert a dummy event with a tag equal to the current physical + * time (and a microstep of 0). This will enforce advancement of tag for this + * federate and causes a NET message to be sent repeatedly as physical time + * advances with the time interval between messages controlled by the target + * parameter coordination-options: {advance-message-interval timevalue}. It will + * stop creating dummy events if and when its event queue has an event with a + * timestamp less than physical time. + * + * If wait_for_reply is false, then this function will simply send the + * specified tag and return that tag immediately. This is useful when a + * federate is shutting down and will not be sending any more messages at all. + * + * In all cases, this returns either the specified tag or + * another tag when it is safe to advance logical time to the returned tag. + * The returned tag may be less than the specified tag if there are upstream + * federates and either the RTI responds with a lesser tag or + * the wait for a response from the RTI is interrupted by a + * change in the event queue. + * + * This function is used in centralized coordination only. + * + * This function assumes the caller holds the mutex lock. + * + * @param env The environment of the federate + * @param tag The tag. + * @param wait_for_reply If true, wait for a reply. + */ +tag_t lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply); + +/** + * @brief Send a port absent message. 
+ * + * This informs the remote federate that it will not receive a message with tag less than the + * current tag of the specified environment delayed by the additional_delay. + * + * @param env The environment from which to get the current tag. + * @param additional_delay The after delay of the connection or NEVER if none. + * @param port_ID The ID of the receiving port. + * @param fed_ID The fed ID of the receiving federate. + */ +void lf_send_port_absent_to_federate( + environment_t* env, + interval_t additional_delay, + unsigned short port_ID, + unsigned short fed_ID); + +/** + * @brief Send a MSG_TYPE_STOP_REQUEST message to the RTI. + * + * The payload is the specified tag plus one microstep. If this federate has previously + * received a stop request from the RTI, then do not send the message and + * return 1. Return -1 if the socket is disconnected. Otherwise, return 0. + * @return 0 if the message is sent. + */ +int lf_send_stop_request_to_rti(tag_t stop_tag); + +/** + * @brief Send a tagged message to the specified port of the specified federate. + * + * The tag will be the current tag of the specified environment delayed by the specified additional_delay. + * If the delayed tag falls after the timeout time, then the message is not sent and -1 is returned. + * The caller can reuse or free the memory storing the message after this returns. + * + * If the message fails to send (e.g. the socket connection is broken), then the + * response depends on the message_type. For MSG_TYPE_TAGGED_MESSAGE, the message is + * supposed to go via the RTI, and failure to communicate with the RTI is a critical failure. + * In this case, the program will exit with an error message. If the message type is + * MSG_TYPE_P2P_TAGGED_MESSAGE, then the failure is not critical. It may be due to the + * remote federate having exited, for example, because its safe-to-process offset led it + * to believe that there were no messages forthcoming. 
In this case, on failure to send + * the message, this function returns -1. * - * @note This function is similar to send_message() except that it - * sends timed messages and also contains logics related to time. + * This method assumes that the caller does not hold the lf_outbound_socket_mutex lock, + * which it acquires to perform the send. * - * @param env The environment in which we are executing - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. - * @param message_type The type of the message being sent. - * Currently can be MSG_TYPE_TAGGED_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_TAGGED_MESSAGE for messages sent between - * federates. + * @param env The environment from which to get the current tag. + * @param additional_delay The after delay on the connection or NEVER if there is none. + * @param message_type The type of the message being sent. Currently can be + * MSG_TYPE_TAGGED_MESSAGE for messages sent via the RTI or MSG_TYPE_P2P_TAGGED_MESSAGE + * for messages sent directly between federates. * @param port The ID of the destination port. * @param federate The ID of the destination federate. * @param next_destination_str The next destination in string format (RTI or federate) * (used for reporting errors). * @param length The message length. * @param message The message. - * @return 1 if the message has been sent, 0 otherwise. + * @return 0 if the message has been sent, -1 otherwise. + */ +int lf_send_tagged_message( + environment_t* env, + interval_t additional_delay, + int message_type, + unsigned short port, + unsigned short federate, + const char* next_destination_str, + size_t length, + unsigned char* message); + +/** + * @brief Set the federation_id of this federate. + * @param fid The federation ID.
+ */ +void lf_set_federation_id(const char* fid); + +/** + * @brief Set the trace object for this federate (used when tracing is enabled). + * + * @param trace The trace object. + */ +void lf_set_federation_trace_object(trace_t * trace); + +#ifdef FEDERATED_DECENTRALIZED +/** + * @brief Spawn a thread to iterate through STAA structs. + * + * This will set their associated ports absent + * at an offset if the port is not present with a value by a certain physical time. + */ +void lf_spawn_staa_thread(void); +#endif + +/** + * @brief Wait until inputs statuses are known up to and including the specified level. + * + * Specifically, wait until the specified level is less than the max level allowed to + * advance (MLAA). + * @param env The environment (which should always be the top-level environment). + * @param level The level to which we would like to advance. */ -int send_timed_message(environment_t*, - interval_t, - int, - unsigned short, - unsigned short, - const char*, - size_t, - unsigned char*); +void lf_stall_advance_level_federation(environment_t* env, size_t level); /** - * Synchronize the start with other federates via the RTI. + * @brief Synchronize the start with other federates via the RTI. + * * This assumes that a connection to the RTI is already made * and _lf_rti_socket_TCP is valid. It then sends the current logical * time to the RTI and waits for the RTI to respond with a specified * time. It starts a thread to listen for messages from the RTI. */ -void synchronize_with_other_federates(); +void lf_synchronize_with_other_federates(); /** - * Wait until the status of network port "port_ID" is known. - * - * In decentralized coordination mode, the wait time is capped by STAA + STA, - * after which the status of the port is presumed to be absent. + * @brief Update the max level allowed to advance (MLAA).
+ * + * If the specified tag is greater than the current_tag of the top-level environment + * (or equal and is_provisional is false), then set the MLAA to INT_MAX and return. + * This removes any barriers on execution at the current tag due to network inputs. + * Otherwise, set the MLAA to the minimum level over all (non-physical) network input ports + * where the status of the input port is not known at that current_tag. * - * This function assumes the holder does not hold a mutex. + * This function assumes that the caller holds the mutex. * - * @param env The environment in which we are executing - * @param port_ID The ID of the network port - * @param STAA The safe-to-assume-absent threshold for the port + * @param tag The latest TAG or PTAG received by this federate. + * @param is_provisional Whether the tag was provisional. + * @return True if the MLAA changed. */ -void wait_until_port_status_known(environment_t* env, int portID, interval_t STAA); +bool lf_update_max_level(tag_t tag, bool is_provisional); #endif // FEDERATE_H diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 38001cc0b..9ea720fd7 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -37,12 +37,9 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Each federate attempts to connect with an RTI at the IP address * put into its code by the code generator (i.e., it attempts to - * open a TCP connection). It starts by trying the - * port number given by STARTING_PORT and increments the port number - * from there until it successfully connects. The maximum port number - * it will try before giving up is STARTING_PORT + PORT_RANGE_LIMIT. - * - * FIXME: What if a port is specified in the "at" of the federated statement? + * open a TCP connection). If an explicit port is given in the `at` clause + * on the `federated reactor` statement, it will use that port. 
Otherwise, it will + * use DEFAULT_PORT. * * When it has successfully opened a TCP connection, the first message it sends * to the RTI is a MSG_TYPE_FED_IDS message, which contains the ID of this federate @@ -137,9 +134,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * parameter of the target is "decentralized" and the federate has * inbound connections from other federates, then it starts a socket * server to listen for incoming connections from those federates. - * It attempts to create the server at the port given by STARTING_PORT, - * and if this fails, increments the port number from there until a - * port is available. It then sends to the RTI an MSG_TYPE_ADDRESS_ADVERTISEMENT message + * It then sends to the RTI an MSG_TYPE_ADDRESS_ADVERTISEMENT message * with the port number as a payload. The federate then creates a thread * to listen for incoming socket connections and messages. * @@ -156,18 +151,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Physical connections also use the above P2P sockets between * federates even if the coordination is centralized. * - * Note: Peer-to-peer sockets can be closed by the downstream federate. - * For example, when a downstream federate reaches its stop time, then - * it will stop accepting physical messages. To achieve an orderly shutdown, - * the downstream federate sends a MSG_TYPE_CLOSE_REQUEST message to the upstream - * one and the upstream federate handles closing the socket. This way, any - * messages that are in the middle of being sent while the downstream - * federate shuts down will successfully traverse the socket, even if - * only to be ignored by the downstream federate. It is valid to ignore - * such messages if the connection is physical or if the coordination is - * decentralized and the messages arrive after the STP offset of the - * downstream federate (i.e., they are "tardy"). 
- * * Afterward, the federates and the RTI decide on a common start time by having * each federate report a reading of its physical clock to the RTI on a * `MSG_TYPE_TIMESTAMP`. The RTI broadcasts the maximum of these readings plus @@ -180,7 +163,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * each federate has a valid event at the start tag (start time, 0) and it will * inform the RTI of this event. * Subsequently, at the conclusion of each tag, each federate will send a - * `MSG_TYPE_LOGICAL_TAG_COMPLETE` followed by a `MSG_TYPE_NEXT_EVENT_TAG` (see + * `MSG_TYPE_LATEST_TAG_COMPLETE` followed by a `MSG_TYPE_NEXT_EVENT_TAG` (see * the comment for each message for further explanation). Each federate would * have to wait for a `MSG_TYPE_TAG_ADVANCE_GRANT` or a * `MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT` before it can advance to a @@ -208,7 +191,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define UDP_TIMEOUT_TIME SEC(1) - /** * Size of the buffer used for messages sent between federates. * This is used by both the federates and the rti, so message lengths @@ -217,63 +199,61 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FED_COM_BUFFER_SIZE 256u /** - * Number of nanoseconds that elapse between a federate's attempts - * to connect to the RTI. + * Time between a federate's attempts to connect to the RTI. */ -#define CONNECT_RETRY_INTERVAL 2000000000LL +#define CONNECT_RETRY_INTERVAL MSEC(500) /** * Bound on the number of retries to connect to the RTI. * A federate will retry every CONNECT_RETRY_INTERVAL seconds - * this many times before giving up. E.g., 500 retries every - * 2 seconds results in retrying for about 16 minutes. + * this many times before giving up. + */ +#define CONNECT_MAX_RETRIES 100 + +/** + * Maximum number of port addresses that a federate will try to connect to the RTI on. 
+ * If you are using automatic ports beginning at DEFAULT_PORT, this puts an upper bound + * on the number of RTIs that can be running on the same host. */ -#define CONNECT_NUM_RETRIES 500 +#define MAX_NUM_PORT_ADDRESSES 16 /** - * Number of nanoseconds that a federate waits before asking + * Time that a federate waits before asking * the RTI again for the port and IP address of a federate * (an MSG_TYPE_ADDRESS_QUERY message) after the RTI responds that it - * does not know. + * does not know. This allows time for federates to start separately. */ -#define ADDRESS_QUERY_RETRY_INTERVAL 100000000LL +#define ADDRESS_QUERY_RETRY_INTERVAL MSEC(250) /** - * Number of nanoseconds that a federate waits before trying - * another port for the RTI. This is to avoid overwhelming - * the OS and the socket with too many calls. - * FIXME: Is this too small? + * Time to wait before re-attempting to bind to a port. + * When a process closes, the network stack typically waits between 30 and 120 + * seconds before releasing the port. This is to allow for delayed packets so + * that a new process does not receive packets from a previous process. + * Here, we limit the retries to 60 seconds. */ -#define PORT_KNOCKING_RETRY_INTERVAL 10000LL +#define PORT_BIND_RETRY_INTERVAL SEC(1) /** - * Default starting port number for the RTI and federates' socket server. - * Unless a specific port has been specified by the LF program in the "at" - * for the RTI, when the federates start up, they will attempt - * to open a socket server - * on this port, and, if this fails, increment the port number and - * try again. The number of increments is limited by PORT_RANGE_LIMIT. - * FIXME: Clarify what happens if a specific port has been given in "at". + * Number of attempts to bind to a port before giving up. */ -#define STARTING_PORT 15045u +#define PORT_BIND_RETRY_LIMIT 60 /** - * Number of ports to try to connect to.
Unless the LF program specifies - * a specific port number to use, the RTI or federates will attempt to start - * a socket server on port STARTING_PORT. If that port is not available (e.g., - * another RTI is running or has recently exited), then it will try the - * next port, STARTING_PORT+1, and keep incrementing the port number up to this - * limit. If no port between STARTING_PORT and STARTING_PORT + PORT_RANGE_LIMIT - * is available, then the RTI or the federate will fail to start. This number, therefore, - * limits the number of RTIs and federates that can be simultaneously - * running on any given machine without assigning specific port numbers. + * Default port number for the RTI. + * Unless a specific port has been specified by the LF program in the "at" + * for the RTI or on the command line, when the RTI starts up, it will attempt + * to open a socket server on this port. */ -#define PORT_RANGE_LIMIT 1024 +#define DEFAULT_PORT 15045u /** * Delay the start of all federates by this amount. - * FIXME: More. - * FIXME: Should use the latency estimates that were + * This helps ensure that the federates do not start at the same time. + * Each federate has provided its current physical time to the RTI, and + * the RTI has picked the largest of these. It will add this quantity + * and declare that to be the start time. + * FIXME: This could use the latency estimates that were * acquired during initial clock synchronization. */ #define DELAY_START SEC(1) @@ -323,7 +303,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * to the RTI. This is its first message to the RTI. * The RTI will respond with either MSG_TYPE_REJECT, MSG_TYPE_ACK, or MSG_TYPE_UDP_PORT. * If the federate is a C target LF program, the generated federate - * code does this by calling synchronize_with_other_federates(), + * code does this by calling lf_synchronize_with_other_federates(), * passing to it its federate ID. 
*/ #define MSG_TYPE_FED_IDS 1 @@ -397,20 +377,23 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_MESSAGE 3 -/** Byte identifying that the federate is ending its execution. */ +/** + * Byte identifying that the federate or the RTI is ending its execution. + */ #define MSG_TYPE_RESIGN 4 -/** Byte identifying a timestamped message to forward to another federate. - * The next two bytes will be the ID of the destination reactor port. - * The next two bytes are the destination federate ID. - * The four bytes after that will be the length of the message. - * The next eight bytes will be the timestamp of the message. - * The next four bytes will be the microstep of the message. - * The remaining bytes are the message. +/** + * Byte identifying a timestamped message to forward to another federate. + * The next two bytes will be the ID of the destination reactor port. + * The next two bytes are the destination federate ID. + * The four bytes after that will be the length of the message. + * The next eight bytes will be the timestamp of the message. + * The next four bytes will be the microstep of the message. + * The remaining bytes are the message. * - * With centralized coordination, all such messages flow through the RTI. - * With decentralized coordination, tagged messages are sent peer-to-peer - * between federates and are marked with MSG_TYPE_P2P_TAGGED_MESSAGE. + * With centralized coordination, all such messages flow through the RTI. + * With decentralized coordination, tagged messages are sent peer-to-peer + * between federates and are marked with MSG_TYPE_P2P_TAGGED_MESSAGE. */ #define MSG_TYPE_TAGGED_MESSAGE 5 @@ -451,12 +434,12 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT 8 /** - * Byte identifying a logical tag complete (LTC) message sent by a federate + * Byte identifying a latest tag complete (LTC) message sent by a federate * to the RTI. * The next eight bytes will be the timestep of the completed tag. * The next four bytes will be the microsteps of the completed tag. */ -#define MSG_TYPE_LOGICAL_TAG_COMPLETE 9 +#define MSG_TYPE_LATEST_TAG_COMPLETE 9 /////////// Messages used in lf_request_stop() /////////////// //// Overview of the algorithm: @@ -598,14 +581,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_P2P_TAGGED_MESSAGE 17 -/** - * Byte identifying a message that a downstream federate sends to its - * upstream counterpart to request that the socket connection be closed. - * This is the only message that should flow upstream on such socket - * connections. - */ -#define MSG_TYPE_CLOSE_REQUEST 18 - //////////////////////////////////////////////// /** * Physical clock synchronization messages according to PTP. @@ -685,6 +660,11 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MSG_TYPE_NEIGHBOR_STRUCTURE 24 #define MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE 9 +/** + * Byte identifying that the federate or the RTI has failed. + */ +#define MSG_TYPE_FAILED 25 + ///////////////////////////////////////////// //// Rejection codes diff --git a/include/core/federated/network/net_util.h b/include/core/federated/network/net_util.h index 5c6bcb966..6346e21d3 100644 --- a/include/core/federated/network/net_util.h +++ b/include/core/federated/network/net_util.h @@ -51,6 +51,9 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "../../platform.h" #include "../../tag.h" +#define NUM_SOCKET_RETRIES 10 +#define DELAY_BETWEEN_SOCKET_RETRIES MSEC(100) + #define HOST_LITTLE_ENDIAN 1 #define HOST_BIG_ENDIAN 2 @@ -62,24 +65,55 @@ int host_is_big_endian(void); #ifdef FEDERATED +/** + * Mutex protecting socket close operations. + */ +extern lf_mutex_t socket_mutex; /** * @brief Create an IPv4 TCP socket with Nagle's algorithm disabled * (TCP_NODELAY) and Delayed ACKs disabled (TCP_QUICKACK). Exits application * on any error. * - * @return int + * @return The socket ID (a file descriptor). */ int create_real_time_tcp_socket_errexit(); +/** + * Read the specified number of bytes from the specified socket into the specified buffer. + * If an error occurs during this reading, return -1 and set errno to indicate + * the cause of the error. If the read succeeds in reading the specified number of bytes, + * return 0. If an EOF occurs before reading the specified number of bytes, return 1. + * This function repeats the read attempt until the specified number of bytes + * have been read, an EOF is read, or an error occurs. Specifically, errors EAGAIN, + * EWOULDBLOCK, and EINTR are not considered errors and instead trigger + * another attempt. A delay between attempts is given by DELAY_BETWEEN_SOCKET_RETRIES. + * @param socket The socket ID. + * @param num_bytes The number of bytes to read. + * @param buffer The buffer into which to put the bytes. + * @return 0 for success, 1 for EOF, and -1 for an error. + */ +int read_from_socket(int socket, size_t num_bytes, unsigned char* buffer); + +/** + * Read the specified number of bytes from the specified socket using read_from_socket + * and close the socket if an error occurs. If an error occurs, this will change the + * socket ID pointed to by the first argument to -1 and will return -1. + * @param socket Pointer to the socket ID. + * @param num_bytes The number of bytes to read. + * @param buffer The buffer into which to put the bytes.
+ * @return 0 for success, -1 for failure. + */ +int read_from_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer); + /** * Read the specified number of bytes from the specified socket into the * specified buffer. If a disconnect or an EOF occurs during this * reading, then if format is non-null, report an error and exit. + * If the mutex argument is non-NULL, release the mutex before exiting. * If format is null, then report the error, but do not exit. - * This function takes a formatted - * string and additional optional arguments similar to printf(format, ...) - * that is appended to the error messages. + * This function takes a formatted string and additional optional arguments + * similar to printf(format, ...) that is appended to the error messages. * @param socket The socket ID. * @param num_bytes The number of bytes to read. * @param buffer The buffer into which to put the bytes. @@ -88,88 +122,72 @@ int create_real_time_tcp_socket_errexit(); * @return The number of bytes read, or 0 if an EOF is received, or * a negative number for an error. */ -ssize_t read_from_socket_errexit( - int socket, +void read_from_socket_fail_on_error( + int* socket, size_t num_bytes, unsigned char* buffer, + lf_mutex_t* mutex, char* format, ...); -ssize_t write_to_socket(int socket, size_t num_bytes, unsigned char* buffer); - /** - * Read the specified number of bytes from the specified socket into the - * specified buffer. If a disconnect occurs during this - * reading, return a negative number. If an EOF occurs during this - * reading, return 0. Otherwise, return the number of bytes read. - * This is a version of read_from_socket_errexit() that does not error out. + * Without blocking, peek at the specified socket and, if there is + * anything on the queue, put its first byte at the specified address and return 1. + * If there is nothing on the queue, return 0, and if an error occurs, + * return -1. * @param socket The socket ID. 
- * @param num_bytes The number of bytes to read. - * @param buffer The buffer into which to put the bytes. - * @return The number of bytes read or 0 when EOF is received or negative for an error. + * @param result Pointer to where to put the first byte available on the socket. */ -ssize_t read_from_socket(int socket, size_t num_bytes, unsigned char* buffer); +ssize_t peek_from_socket(int socket, unsigned char* result); /** * Write the specified number of bytes to the specified socket from the - * specified buffer. If a disconnect or an EOF occurs during this - * reading, report an error and exit, unless the format string is NULL, - * in which case, report an error and return. This function takes a formatted - * string and additional optional arguments similar to printf(format, ...) - * that is appended to the error messages. + * specified buffer. If an error occurs, return -1 and set errno to indicate + * the cause of the error. If the write succeeds, return 0. + * This function repeats the attempt until the specified number of bytes + * have been written or an error occurs. Specifically, errors EAGAIN, + * EWOULDBLOCK, and EINTR are not considered errors and instead trigger + * another attempt. A delay between attempts is given by + * DELAY_BETWEEN_SOCKET_RETRIES. * @param socket The socket ID. * @param num_bytes The number of bytes to write. * @param buffer The buffer from which to get the bytes. - * @param mutex If non-NULL, the mutex to unlock before exiting. - * @param format A format string for error messages, followed by any number of - * fields that will be used to fill the format string as in printf, or NULL - * to prevent exit on error. - * @return The number of bytes written, or 0 if an EOF was received, or a negative - * number if an error occurred. + * @return 0 for success, -1 for failure. 
*/ -ssize_t write_to_socket_with_mutex( - int socket, - size_t num_bytes, - unsigned char* buffer, - lf_mutex_t* mutex, - char* format, ...); +int write_to_socket(int socket, size_t num_bytes, unsigned char* buffer); /** - * Write the specified number of bytes to the specified socket from the - * specified buffer. If a disconnect or an EOF occurs during this - * reading, report an error and exit, unless the format string is NULL, - * in which case, report an error and return. This function takes a formatted - * string and additional optional arguments similar to printf(format, ...) - * that is appended to the error messages. - * @param socket The socket ID. + * Write the specified number of bytes to the specified socket using write_to_socket + * and close the socket if an error occurs. If an error occurs, this will change the + * socket ID pointed to by the first argument to -1 and will return -1. + * @param socket Pointer to the socket ID. + * @param num_bytes The number of bytes to write. + * @param buffer The buffer from which to get the bytes. + * @return 0 for success, -1 for failure. + */ +int write_to_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer); + +/** + * Write the specified number of bytes to the specified socket using + * write_to_socket_close_on_error and exit with an error code if an error occurs. + * If the mutex argument is non-NULL, release the mutex before exiting. If the + * format argument is non-null, then use it and any additional arguments to form + * the error message using printf conventions. Otherwise, print a generic error + * message. + * @param socket Pointer to the socket ID. * @param num_bytes The number of bytes to write. * @param buffer The buffer from which to get the bytes. * @param mutex If non-NULL, the mutex to unlock before exiting.
* @param format A format string for error messages, followed by any number of * fields that will be used to fill the format string as in printf, or NULL - * to prevent exit on error. - * @return The number of bytes written, or 0 if an EOF was received, or a negative - * number if an error occurred. + * to print a generic error message. */ -ssize_t write_to_socket_errexit( - int socket, +void write_to_socket_fail_on_error( + int* socket, size_t num_bytes, unsigned char* buffer, + lf_mutex_t* mutex, char* format, ...); -/** - * Write the specified number of bytes to the specified socket from the - * specified buffer. If a disconnect or an EOF occurs during this - * reading, return a negative number or 0 respectively. Otherwise, - * return the number of bytes written. - * This is a version of write_to_socket() that does not error out. - * @param socket The socket ID. - * @param num_bytes The number of bytes to write. - * @param buffer The buffer from which to get the bytes. - * @return The number of bytes written, or 0 if an EOF was received, or a negative - * number if an error occurred. - */ -int write_to_socket2(int socket, int num_bytes, unsigned char* buffer); - #endif // FEDERATED /** @@ -332,7 +350,7 @@ void encode_tag( ); /** - * A helper struct for passing rti_addr information between parse_rti_addr and extract_rti_addr_info + * A helper struct for passing rti_addr information between lf_parse_rti_addr and extract_rti_addr_info */ typedef struct rti_addr_info_t { char rti_host_str[256]; diff --git a/include/core/lf_types.h b/include/core/lf_types.h index 9cbcea9b3..eb626658e 100644 --- a/include/core/lf_types.h +++ b/include/core/lf_types.h @@ -239,7 +239,7 @@ struct trigger_t { interval_t offset; // Minimum delay of an action. For a timer, this is also the maximum delay. interval_t period; // Minimum interarrival time of an action. For a timer, this is also the maximal interarrival time. bool is_physical; // Indicator that this denotes a physical action. 
- instant_t last_time; // Time of the last event that was scheduled for this action. + tag_t last_tag; // Tag of the last event that was scheduled for this action. // This is only used for actions and will otherwise be NEVER. lf_spacing_policy_t policy; // Indicates which policy to use when an event is scheduled too early. port_status_t status; // Determines the status of the port at the current logical time. Therefore, this diff --git a/include/core/reactor.h b/include/core/reactor.h index d9ee515b5..9d36c6627 100644 --- a/include/core/reactor.h +++ b/include/core/reactor.h @@ -60,17 +60,21 @@ #define CONSTRUCTOR(classname) (new_ ## classname) #define SELF_STRUCT_T(classname) (classname ## _self_t) -//////////////////////////////////////////////////////////// -//// Macros for producing outputs. - -// NOTE: According to the "Swallowing the Semicolon" section on this page: -// https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html -// the following macros should use an odd do-while construct to avoid -// problems with if ... else statements that do not use braces around the -// two branches. - -// Declarations for functions used by the macros. +/** + * Unless the "fast" option is given, an LF program will wait until + * physical time matches logical time before handling an event with + * a given logical time. If the amount of time is less than this given + * threshold, then no wait will occur. The purpose of this is + * to prevent unnecessary delays caused by simply setting up and + * performing the wait. + */ +#define MIN_SLEEP_DURATION USEC(10) +/** + * Print an event from the event queue. + * This is a function of type pqueue_print_entry_f. + */ +void _lf_print_event(void* event); /** * Mark the given port's is_present field as true. This is_present field * will later be cleaned up by _lf_start_time_step.
@@ -555,9 +559,6 @@ trigger_handle_t _lf_schedule_value(lf_action_base_t* action, interval_t extra_d */ trigger_handle_t _lf_schedule_copy(lf_action_base_t* action, interval_t offset, void* value, size_t length); -// See reactor.h for doc. -int _lf_fd_send_stop_request_to_rti(tag_t stop_tag); - /** * @brief Will create and initialize the required number of environments for the program * @note Will be code generated by the compiler diff --git a/include/core/reactor_common.h b/include/core/reactor_common.h index be74165b7..29fb73c58 100644 --- a/include/core/reactor_common.h +++ b/include/core/reactor_common.h @@ -15,28 +15,15 @@ extern unsigned int _lf_number_of_workers; extern bool fast; extern instant_t duration; -extern bool _lf_execution_started; extern bool keepalive_specified; extern interval_t _lf_fed_STA_offset; +/** Flag used to disable cleanup operations on normal termination. */ +extern bool _lf_normal_termination; + extern int default_argc; extern const char** default_argv; -#ifdef FEDERATED -void reset_status_fields_on_input_port_triggers(); -port_status_t determine_port_status_if_possible(int portID); -typedef enum parse_rti_code_t { - SUCCESS, - INVALID_PORT, - INVALID_HOST, - INVALID_USER, - FAILED_TO_PARSE -} parse_rti_code_t; -parse_rti_code_t parse_rti_addr(const char* rti_addr); -void set_federation_id(const char* fid); -void set_federation_trace_object(trace_t * trace); -#endif - extern struct allocation_record_t* _lf_reactors_to_free; void* _lf_new_reactor(size_t size); void _lf_free(struct allocation_record_t** head); @@ -61,7 +48,7 @@ event_t* _lf_create_dummy_events( event_t* next, microstep_t offset ); -int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token); +trigger_handle_t _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token); trigger_handle_t _lf_schedule(environment_t* env, trigger_t* trigger, interval_t extra_delay, lf_token_t* token); trigger_handle_t 
_lf_insert_reactions_for_trigger(environment_t* env, trigger_t* trigger, lf_token_t* token); diff --git a/include/core/tag.h b/include/core/tag.h index 0a480a40b..e38ea7de5 100644 --- a/include/core/tag.h +++ b/include/core/tag.h @@ -88,6 +88,11 @@ tag_t lf_tag(void* env); * return NEVER_TAG or FOREVER_TAG, respectively. Also return NEVER_TAG or FOREVER_TAG * if the result underflows or overflows when adding the times. * If the microstep overflows, also return FOREVER_TAG. + * If the time field of the second tag is greater than 0, then the microstep of the first tag + * is reset to 0 before adding. This models the delay semantics in LF and makes this + * addition operation non-commutative. + * @param a The first tag. + * @param b The second tag. */ tag_t lf_tag_add(tag_t a, tag_t b); diff --git a/include/core/threaded/reactor_threaded.h b/include/core/threaded/reactor_threaded.h index 0053112d0..f0f3d424b 100644 --- a/include/core/threaded/reactor_threaded.h +++ b/include/core/threaded/reactor_threaded.h @@ -17,7 +17,7 @@ void try_advance_level(environment_t* env, volatile size_t* next_reaction_level) * message to downstream federates if a given network output port is not present. 
* @param env The environment in which we are executing */ -void enqueue_port_absent_reactions(environment_t* env); +void lf_enqueue_port_absent_reactions(environment_t* env); /** * Raise a barrier to prevent the current tag for the specified environment from advancing @@ -79,7 +79,7 @@ void _lf_increment_tag_barrier_locked(environment_t *env, tag_t future_tag); void _lf_decrement_tag_barrier_locked(environment_t* env); int _lf_wait_on_tag_barrier(environment_t* env, tag_t proposed_tag); -void synchronize_with_other_federates(void); +void lf_synchronize_with_other_federates(void); bool wait_until(environment_t* env, instant_t logical_time_ns, lf_cond_t* condition); tag_t get_next_event_tag(environment_t* env); tag_t send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply); diff --git a/include/core/trace.h b/include/core/trace.h index 598c669bf..d88abc291 100644 --- a/include/core/trace.h +++ b/include/core/trace.h @@ -76,9 +76,10 @@ typedef enum worker_wait_ends, scheduler_advancing_time_starts, scheduler_advancing_time_ends, - federated, // Everything above this is tracing federated interactions. + federated, // Everything below this is for tracing federated interactions. // Sending messages send_ACK, + send_FAILED, send_TIMESTAMP, send_NET, send_LTC, @@ -100,6 +101,7 @@ typedef enum send_ADR_QR, // Receiving messages receive_ACK, + receive_FAILED, receive_TIMESTAMP, receive_NET, receive_LTC, @@ -142,6 +144,7 @@ static const char *trace_event_names[] = { "Federated marker", // Sending messages "Sending ACK", + "Sending FAILED", "Sending TIMESTAMP", "Sending NET", "Sending LTC", @@ -163,6 +166,7 @@ static const char *trace_event_names[] = { "Sending ADR_QR", // Receiving messages "Receiving ACK", + "Receiving FAILED", "Receiving TIMESTAMP", "Receiving NET", "Receiving LTC", @@ -435,6 +439,10 @@ void tracepoint_reaction_deadline_missed(trace_t* trace, reaction_t *reaction, i * close the files. 
*/ void stop_trace(trace_t* trace); + +/** + * Version of stop_trace() that does not lock the trace mutex. + */ void stop_trace_locked(trace_t* trace); //////////////////////////////////////////////////////////// diff --git a/include/core/utils/pqueue_tag.h b/include/core/utils/pqueue_tag.h index aef72d507..ad4ac84d1 100644 --- a/include/core/utils/pqueue_tag.h +++ b/include/core/utils/pqueue_tag.h @@ -35,7 +35,7 @@ * pqueue_tag_insert_tag, pqueue_tag_insert_if_no_match, and pqueue_tag_pop_tag. * * To customize the element you put onto the queue, for example to carry - * a pyaload, you can create your own element struct type by simply declaring + * a payload, you can create your own element struct type by simply declaring * the first field to be a pqueue_tag_element_t. For example, if you want an * element of the queue to include a pointer to your own payload, you can * declare the following struct type: @@ -56,12 +56,13 @@ typedef struct { } pqueue_tag_element_t; /** - * Type of a priority queue sorted by tags. + * @brief Type of a priority queue sorted by tags. */ typedef pqueue_t pqueue_tag_t; /** * @brief Create a priority queue sorted by tags. + * * The elements of the priority queue will be of type pqueue_tag_element_t. * The caller should call pqueue_tag_free() when finished with the queue. * @return A dynamically allocated priority queue or NULL if memory allocation fails. @@ -69,19 +70,22 @@ typedef pqueue_t pqueue_tag_t; pqueue_tag_t* pqueue_tag_init(size_t initial_size); /** - * Free all memory used by the queue including any elements that are marked is_dynamic. + * @brief Free all memory used by the queue including elements that are marked dynamic. + * * @param q The queue. */ void pqueue_tag_free(pqueue_tag_t *q); /** - * Return the size of the queue. + * @brief Return the size of the queue. + * * @param q The queue. */ size_t pqueue_tag_size(pqueue_tag_t *q); /** - * Insert an element into the queue. + * @brief Insert an element into the queue. 
+ * * @param q The queue. * @param e The element to insert. * @return 0 on success @@ -90,6 +94,7 @@ int pqueue_tag_insert(pqueue_tag_t* q, pqueue_tag_element_t* d); /** * @brief Insert a tag into the queue. + * * This automatically creates a dynamically allocated element in the queue * and ensures that if the element is still on the queue when pqueue_tag_free * is called, then that memory will be freed. @@ -101,6 +106,7 @@ int pqueue_tag_insert_tag(pqueue_tag_t* q, tag_t t); /** * @brief Insert a tag into the queue if the tag is not already in the queue. + * * This automatically creates a dynamically allocated element in the queue * and ensures that if the element is still on the queue when pqueue_tag_free * is called, then that memory will be freed. @@ -111,16 +117,30 @@ int pqueue_tag_insert_tag(pqueue_tag_t* q, tag_t t); int pqueue_tag_insert_if_no_match(pqueue_tag_t* q, tag_t t); /** - * @brief Pop the least-tag element from the queue and return its tag. - * If the queue is empty, return FOREVER_TAG. This function handles freeing - * the element struct if it was dynamically allocated. + * @brief Return the first item with the specified tag or NULL if there is none. * @param q The queue. - * @return NULL on error, otherwise the entry + * @param t The tag. + * @return An entry with the specified tag or NULL if there isn't one. */ -tag_t pqueue_tag_pop_tag(pqueue_tag_t* q); +pqueue_tag_element_t* pqueue_tag_find_with_tag(pqueue_tag_t *q, tag_t t); + +/** + * @brief Return highest-ranking item (the one with the least tag) without removing it. + * @param q The queue. + * @return NULL if the queue is empty, otherwise the entry. + */ +pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q); + +/** + * @brief Return the least tag in the queue or FOREVER_TAG if the queue is empty. + * @param q The queue. + * @return The least tag in the queue or FOREVER_TAG if the queue is empty.
+ */ +tag_t pqueue_tag_peek_tag(pqueue_tag_t* q); /** * @brief Pop the least-tag element from the queue. + * * If the entry was dynamically allocated, then it is now up to the caller * to ensure that it is freed. It will not be freed by pqueue_tag_free. * @param q The queue. @@ -129,26 +149,30 @@ tag_t pqueue_tag_pop_tag(pqueue_tag_t* q); pqueue_tag_element_t* pqueue_tag_pop(pqueue_tag_t* q); /** - * Return the first item with the specified tag or NULL if there is none. + * @brief Pop the least-tag element from the queue and return its tag. + * + * If the queue is empty, return FOREVER_TAG. This function handles freeing + * the element struct if it was dynamically allocated. * @param q The queue. - * @param t The tag. - * @return An entry with the specified tag or NULL if there isn't one. + * @return The tag of the popped element, or FOREVER_TAG if the queue is empty. */ -pqueue_tag_element_t* pqueue_tag_find_with_tag(pqueue_tag_t *q, tag_t t); +tag_t pqueue_tag_pop_tag(pqueue_tag_t* q); /** - * Remove an item from the queue. + * @brief Remove an item from the queue. + * * @param q The queue. * @param e The entry to remove. - * @return 0 on success */ -int pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e); +void pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e); /** - * Access highest-ranking item without removing it. + * @brief Remove items from the queue with tags up to and including the specified tag. + * + * If the specified tag is FOREVER_TAG, then all items will be removed. * @param q The queue. - * @return NULL on error, otherwise the entry. + * @param t The specified tag. */ -pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q); +void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t); #endif // PQUEUE_TAG_H diff --git a/include/core/utils/util.h b/include/core/utils/util.h index 42da6c3f5..728880e0f 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -35,6 +35,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include // Defines va_list #include +#include // Defines int64_t // To silence warnings about a function being a candidate for format checking // with gcc, add an attribute. @@ -50,10 +51,10 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Holds generic statistical data */ typedef struct lf_stat_ll { - long long average; - long long standard_deviation; - long long variance; - long long max; + int64_t average; + int64_t standard_deviation; + int64_t variance; + int64_t max; } lf_stat_ll; /** @@ -238,6 +239,12 @@ void lf_vprint_warning(const char* format, va_list args) ATTRIBUTE_FORMAT_PRINTF */ void lf_print_error_and_exit(const char* format, ...) ATTRIBUTE_FORMAT_PRINTF(1, 2); +/** + * Report an error and exit just like lf_print_error_and_exit(), but + * also print the system error message associated with the error. + */ +void lf_print_error_system_failure(const char* format, ...); + /** * varargs alternative of "lf_print_error_and_exit" */ @@ -267,12 +274,16 @@ typedef void(print_message_function_t)(const char*, va_list); void lf_register_print_function(print_message_function_t* function, int log_level); /** - * Assertion handling. LF_ASSERT can be used as a short hand for verifying + * Assertion handling. LF_ASSERT can be used as a shorthand for verifying * a condition and calling `lf_print_error_and_exit` if it is not true. - * This is optimized away if the NDEBUG flag is defined. + * The LF_ASSERT version requires that the condition evaluate to true + * (non-zero), whereas the LF_ASSERTN version requires that the condition + * evaluate to false (zero). + * These are optimized away if the NDEBUG flag is defined. */ #if defined(NDEBUG) #define LF_ASSERT(condition, format, ...) (void)(condition) +#define LF_ASSERTN(condition, format, ...) (void)(condition) #else #define LF_ASSERT(condition, format, ...) 
\ do { \ @@ -280,5 +291,24 @@ void lf_register_print_function(print_message_function_t* function, int log_leve lf_print_error_and_exit(format, ##__VA_ARGS__); \ } \ } while(0) +#define LF_ASSERTN(condition, format, ...) \ + do { \ + if (condition) { \ + lf_print_error_and_exit(format, ##__VA_ARGS__); \ + } \ + } while(0) #endif // NDEBUG + +/** + * Checking mutex locking and unlocking. + * If the NDEBUG flag is defined, only the check of the result is optimized away; the operation itself is still performed. + */ +#define LF_MUTEX_INIT(mutex) LF_ASSERTN(lf_mutex_init(&mutex), "Mutex init failed.") + +#define LF_MUTEX_LOCK(mutex) LF_ASSERTN(lf_mutex_lock(&mutex), "Mutex lock failed.") + +#define LF_MUTEX_UNLOCK(mutex) LF_ASSERTN(lf_mutex_unlock(&mutex), "Mutex unlock failed.") + +#define LF_COND_INIT(cond, mutex) LF_ASSERTN(lf_cond_init(&cond, &mutex), "Condition variable init failed.") + #endif /* UTIL_H */ diff --git a/lingua-franca-ref.txt b/lingua-franca-ref.txt index 1f7391f92..e47bc9016 100644 --- a/lingua-franca-ref.txt +++ b/lingua-franca-ref.txt @@ -1 +1 @@ -master +federated-cleanup diff --git a/test/general/utils/pqueue_test.c b/test/general/utils/pqueue_test.c index f95492799..e0f252c7d 100644 --- a/test/general/utils/pqueue_test.c +++ b/test/general/utils/pqueue_test.c @@ -79,7 +79,7 @@ static void pop_empty(pqueue_tag_t* q) { static void remove_from_queue(pqueue_tag_t* q, pqueue_tag_element_t* e1, pqueue_tag_element_t* e2) { assert(pqueue_tag_insert(q, e1) == 0); assert(pqueue_tag_insert(q, e2) == 0); - assert(pqueue_tag_remove(q, e1) == 0); + pqueue_tag_remove(q, e1); assert(pqueue_tag_peek(q) == e2); assert(pqueue_tag_size(q) == 1); } diff --git a/util/tracing/Makefile b/util/tracing/Makefile index b20292d00..15fd0c13e 100644 --- a/util/tracing/Makefile +++ b/util/tracing/Makefile @@ -37,4 +37,4 @@ install: trace_to_csv trace_to_chrome trace_to_influxdb chmod +x $(BIN_INSTALL_PATH)/fedsd clean: - rm -f *.o + rm -f *.o trace_to_chrome trace_to_influxdb trace_to_csv diff --git a/util/tracing/visualization/fedsd.py
b/util/tracing/visualization/fedsd.py index ed4691bc2..b35e96dd7 100644 --- a/util/tracing/visualization/fedsd.py +++ b/util/tracing/visualization/fedsd.py @@ -41,6 +41,7 @@ # communication rendering prune_event_name = { "Sending ACK": "ACK", + "Sending FAILED": "FAILED", "Sending TIMESTAMP": "TIMESTAMP", "Sending NET": "NET", "Sending LTC": "LTC", @@ -61,6 +62,7 @@ "Sending ADR_AD": "ADR_AD", "Sending ADR_QR": "ADR_QR", "Receiving ACK": "ACK", + "Receiving FAILED": "FAILED", "Receiving TIMESTAMP": "TIMESTAMP", "Receiving NET": "NET", "Receiving LTC": "LTC", @@ -108,7 +110,7 @@ # Events matching at the sender and receiver ends depend on whether they are tagged # (the elapsed logical time and microstep have to be the same) or not. # Set of tagged events (messages) -non_tagged_messages = {'FED_ID', 'ACK', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'} +non_tagged_messages = {'FED_ID', 'ACK', 'RESIGN', 'FAILED', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'} ################################################################################ @@ -670,7 +672,7 @@ def get_and_convert_lft_files(rti_lft_file, federates_lft_files, start_time, end # FIXME: Using microseconds is hardwired here. physical_time = f'{int(row["physical_time"]/1000):,}' - if (row['event'] in {'FED_ID', 'ACK', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'}): + if (row['event'] in {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'}): label = row['event'] else: label = row['event'] + '(' + f'{int(row["logical_time"]):,}' + ', ' + str(row['microstep']) + ')'