diff --git a/rs/p2p/quic_transport/README.adoc b/rs/p2p/quic_transport/README.adoc index 7de146705ab..de0ba43ef09 100644 --- a/rs/p2p/quic_transport/README.adoc +++ b/rs/p2p/quic_transport/README.adoc @@ -8,27 +8,29 @@ The IC Transport layer enables message delivery between nodes within a subnet. == Requirements == +A scalable, RPC-like library designed to operate in a peer-to-peer (P2P) network with Byzantine peers. + [cols="3,3,3"] |=== | **Capability** | **Purpose** | **Implementation Requirement** -| RPC-like library -| Abstracting one-to-one communication between nodes, allowing developers to focus on application logic instead of low-level networking. -| Ensure https://en.wikipedia.org/wiki/Reliability_(computer_networking)[reliable data delivery] so the sender can confirm successful message delivery to the recipient. +| Request–response abstraction +| Simplifies one-to-one communication between nodes, enabling developers to focus on application logic instead of low-level networking. +| Ensure https://en.wikipedia.org/wiki/Reliability_(computer_networking)[reliable data delivery] so the sender receives confirmation of successful message delivery. -| Multitenancy of applications -| Enables multiple applications to safely share the library, ensuring isolation and uninterrupted operation across applications. -| Support https://en.wikipedia.org/wiki/Multiplexing[multiplexing] to independently route messages to their endpoints, unaffected by packet losses or delays impacting other messages. +| Scalable and resource-efficient +| Optimizes shared resource usage (e.g., CPU, file descriptors) to sustain high performance under load and prevent resource exhaustion. +| Use https://en.wikipedia.org/wiki/Connection-oriented_communication[connection-oriented communication] to maintain a single persistent connection, minimizing overhead from repeated TLS handshakes or excessive connections. 
-| Operates in P2P networks with Byzantine peers -| Supports deployment in real-world environments, including networks with faulty or malicious nodes. -| Implement https://en.wikipedia.org/wiki/Flow_control_(data)[flow control] to manage the number of connections, inflight messages, and streams per connection, ensuring stability and resilience. +| Multitenancy of endpoints/handlers +| Supports independent processing of multiple concurrent requests and responses. +| Implement https://en.wikipedia.org/wiki/Multiplexing[multiplexing] to route messages independently, ensuring that packet loss or delay does not affect unrelated messages. -| Scalable and resource-efficient -| Optimizes usage of shared resources (e.g., CPU, file descriptors) to maintain performance under high demand and prevent resource exhaustion. -| Utilize https://en.wikipedia.org/wiki/Connection-oriented_communication[connection-oriented communication] (e.g., a single persistent connection) to minimize overhead from repeated TLS handshakes or excessive connections between peers. +| P2P fairness and resource protection +| Ensures fair resource allocation in a P2P network where all peers are equal, preventing resource exhaustion by any single peer. +| Enforce https://en.wikipedia.org/wiki/Flow_control_(data)[flow control] to limit the number of connections and in-flight messages per peer, ensuring resource usage stays within allocated boundaries. 
|=== diff --git a/rs/p2p/quic_transport/src/metrics.rs b/rs/p2p/quic_transport/src/metrics.rs index 9c3097001be..fc10956486f 100644 --- a/rs/p2p/quic_transport/src/metrics.rs +++ b/rs/p2p/quic_transport/src/metrics.rs @@ -9,10 +9,9 @@ use tokio_metrics::TaskMonitor; const CONNECTION_RESULT_LABEL: &str = "status"; const PEER_ID_LABEL: &str = "peer"; const REQUEST_TASK_MONITOR_NAME: &str = "quic_transport_request_handler"; -const STREAM_TYPE_LABEL: &str = "stream"; const HANDLER_LABEL: &str = "handler"; const ERROR_TYPE_LABEL: &str = "error"; -const REQUEST_TYPE_LABEL: &str = "request"; +const QUINN_API_LABEL: &str = "quinn_api"; pub(crate) const CONNECTION_RESULT_SUCCESS_LABEL: &str = "success"; pub(crate) const CONNECTION_RESULT_FAILED_LABEL: &str = "failed"; pub(crate) const ERROR_TYPE_APP: &str = "app"; @@ -22,6 +21,7 @@ const ERROR_RESET_STREAM: &str = "reset_stream"; const ERROR_STOPPED_STREAM: &str = "stopped_stream"; const ERROR_APP_CLOSED_CONN: &str = "app_closed_conn"; const ERROR_TIMED_OUT_CONN: &str = "timed_out_conn"; +const ERROR_RESET_CONN: &str = "timed_reset_conn"; const ERROR_TRANSPORT_ERROR: &str = "transport_error_conn"; const ERROR_LOCALLY_CLOSED_CONN: &str = "locally_closed_conn"; @@ -116,7 +116,7 @@ impl QuicTransportMetrics { request_handle_errors_total: metrics_registry.int_counter_vec( "quic_transport_request_handle_errors_total", "Request handler errors by stream type and error type.", - &[STREAM_TYPE_LABEL, ERROR_TYPE_LABEL], + &[QUINN_API_LABEL, ERROR_TYPE_LABEL], ), request_handle_bytes_received_total: metrics_registry.int_counter_vec( "quic_transport_request_handle_bytes_received_total", @@ -154,9 +154,8 @@ impl QuicTransportMetrics { connection_handle_errors_total: metrics_registry.int_counter_vec( "quic_transport_connection_handle_errors_total", "Request handler errors by stream type and error type.", - &[REQUEST_TYPE_LABEL, ERROR_TYPE_LABEL], + &[QUINN_API_LABEL, ERROR_TYPE_LABEL], ), - // Quinn stats quinn_path_rtt_seconds: 
metrics_registry.gauge_vec( "quic_transport_quinn_path_rtt_seconds", @@ -215,19 +214,25 @@ pub fn observe_conn_error(err: &ConnectionError, op: &str, counter: &IntCounterV .inc(), // This can occur if the peer crashes or experiences connectivity issues. ConnectionError::TimedOut => counter.with_label_values(&[op, ERROR_TIMED_OUT_CONN]).inc(), - // This should be made infallible. + // TODO: This should be made infallible. It is unclear why we observe those errors. + // It is similar to a TimedOut error, but the key difference is that TimedOut + // usually indicates a failure during data transmission. + ConnectionError::Reset => counter.with_label_values(&[op, ERROR_RESET_CONN]).inc(), + // TODO: This should be made infallible. It is unclear why we observe those errors. ConnectionError::TransportError(_) => counter .with_label_values(&[op, ERROR_TRANSPORT_ERROR]) .inc(), // A connection was closed by the QUIC protocol. Overall should be infallible. - _ => counter.with_label_values(&[op, INFALIBBLE]).inc(), + ConnectionError::VersionMismatch + | ConnectionError::ConnectionClosed(_) + | ConnectionError::CidsExhausted => counter.with_label_values(&[op, INFALIBBLE]).inc(), } } pub fn observe_write_error(err: &WriteError, op: &str, counter: &IntCounterVec) { match err { - // Occurs when the peer cancels the `RecvStream` future, similar to `ERROR_RESET_STREAM` semantics, - // e.g., when the RPC method is part of a `select` branch. + // Occurs when the peer cancels the `RecvStream` future, similar to `ERROR_RESET_STREAM` semantics, + // e.g., can happen on the receive side when the RPC method is part of a `select` branch. 
WriteError::Stopped(_) => counter.with_label_values(&[op, ERROR_STOPPED_STREAM]).inc(), WriteError::ConnectionLost(conn_err) => observe_conn_error(conn_err, op, counter), // If any of the following errors occur it means that we have a bug in the protocol implementation or @@ -240,8 +245,8 @@ pub fn observe_write_error(err: &WriteError, op: &str, counter: &IntCounterVec) pub fn observe_read_error(err: &ReadError, op: &str, counter: &IntCounterVec) { match err { - // Occurs when the peer cancels the `SendStream` future, similar to `ERROR_STOPPED_STREAM` semantics, - // e.g., when the RPC method is part of a `select` branch. + // Occurs when the peer drops the `ResetStreamOnDrop` guard, similar to `ERROR_STOPPED_STREAM` semantics, + // e.g., can happen on the receive side when the RPC method is part of a `select` branch. ReadError::Reset(_) => counter.with_label_values(&[op, ERROR_RESET_STREAM]).inc(), ReadError::ConnectionLost(conn_err) => observe_conn_error(conn_err, op, counter), // If any of the following errors occur it means that we have a bug in the protocol implementation or