-
Notifications
You must be signed in to change notification settings - Fork 33
transport: Make accept async to close the gap on service races #525
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
99800ab
8481c77
cf22d56
4b10f8b
2df6ce7
7061b08
51a94c3
6c23586
517168a
fe1a448
f06a71e
957b139
78b17d2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,7 +39,7 @@ use crate::{ | |
| }; | ||
|
|
||
| use address::{scores, AddressStore}; | ||
| use futures::{Stream, StreamExt}; | ||
| use futures::{future::BoxFuture, stream::FuturesUnordered, Stream, StreamExt}; | ||
| use indexmap::IndexMap; | ||
| use multiaddr::{Multiaddr, Protocol}; | ||
| use multihash::Multihash; | ||
|
|
@@ -249,6 +249,9 @@ pub struct TransportManager { | |
|
|
||
| /// Opening connections errors. | ||
| opening_errors: HashMap<ConnectionId, Vec<(Multiaddr, DialError)>>, | ||
|
|
||
| /// Pending accept futures with associated connection information. | ||
| pending_accept: FuturesUnordered<BoxFuture<'static, (PeerId, Endpoint, crate::Result<()>)>>, | ||
| } | ||
|
|
||
| /// Builder for [`crate::transport::manager::TransportManager`]. | ||
|
|
@@ -351,6 +354,7 @@ impl TransportManagerBuilder { | |
| pending_connections: HashMap::new(), | ||
| connection_limits: limits::ConnectionLimits::new(self.connection_limits_config), | ||
| opening_errors: HashMap::new(), | ||
| pending_accept: FuturesUnordered::new(), | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -1076,6 +1080,35 @@ impl TransportManager { | |
| pub async fn next(&mut self) -> Option<TransportEvent> { | ||
| loop { | ||
| tokio::select! { | ||
| (peer, endpoint, result) = self.pending_accept.select_next_some(), if !self.pending_accept.is_empty() => { | ||
| match result { | ||
| Ok(()) => { | ||
| tracing::trace!( | ||
| target: LOG_TARGET, | ||
| ?peer, | ||
| ?endpoint, | ||
| "connection accepted and protocols notified", | ||
| ); | ||
|
|
||
| return Some(TransportEvent::ConnectionEstablished { peer, endpoint }); | ||
| } | ||
| Err(error) => { | ||
| // The pending accept future has failed to inform one of the | ||
| // installed protocols about the connection. This can happen when the | ||
| // node is shutting down or when the user has dropped the long running protocol. | ||
| // To err on the safe side, roll back the state modification done in `on_connection_established`. | ||
| self.on_connection_closed(peer, endpoint.connection_id()); | ||
|
|
||
| tracing::error!( | ||
| target: LOG_TARGET, | ||
| ?peer, | ||
| ?endpoint, | ||
| ?error, | ||
| "failed to notify protocols about connection", | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| event = self.event_rx.recv() => { | ||
| let Some(event) = event else { | ||
| tracing::error!( | ||
|
|
@@ -1255,16 +1288,36 @@ impl TransportManager { | |
| "accept connection", | ||
| ); | ||
|
|
||
| let _ = self | ||
| match self | ||
| .transports | ||
| .get_mut(&transport) | ||
| .expect("transport to exist") | ||
| .accept(endpoint.connection_id()); | ||
| .accept(endpoint.connection_id()) | ||
| { | ||
| Ok(future) => { | ||
| // A ConnectionEstablished is propagated to the user once | ||
| // all protocols have been notified. | ||
| self.pending_accept.push(Box::pin(async move { | ||
| let result = future.await; | ||
| (peer, endpoint, result) | ||
| })); | ||
| } | ||
| Err(error) => { | ||
| // Roll back the state modification done in `on_connection_established` by | ||
| // simulating a closed connection. The transport returns an error | ||
| // while accepting the connection, which can happen if the transport is | ||
| // already closed or the connection is dropped before the accept call. | ||
| self.on_connection_closed(peer, endpoint.connection_id()); | ||
|
|
||
| return Some(TransportEvent::ConnectionEstablished { | ||
| peer, | ||
| endpoint, | ||
| }); | ||
| tracing::debug!( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't we need to clean up some data in this case? Accept will not run at all when we arrive here, but we already modified some state in |
||
| target: LOG_TARGET, | ||
| ?peer, | ||
| ?endpoint, | ||
| ?error, | ||
| "failed to accept connection", | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| Ok(ConnectionEstablishedResult::Reject) => { | ||
| tracing::trace!( | ||
|
|
@@ -1459,8 +1512,11 @@ mod tests { | |
| Ok(()) | ||
| } | ||
|
|
||
| fn accept(&mut self, _connection_id: ConnectionId) -> crate::Result<()> { | ||
| Ok(()) | ||
| fn accept( | ||
| &mut self, | ||
| _connection_id: ConnectionId, | ||
| ) -> crate::Result<BoxFuture<'static, crate::Result<()>>> { | ||
| Ok(Box::pin(async { Ok(()) })) | ||
| } | ||
|
|
||
| fn accept_pending(&mut self, _connection_id: ConnectionId) -> crate::Result<()> { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -307,35 +307,48 @@ impl Transport for QuicTransport { | |
| Ok(()) | ||
| } | ||
|
|
||
| fn accept(&mut self, connection_id: ConnectionId) -> crate::Result<()> { | ||
| fn accept( | ||
| &mut self, | ||
| connection_id: ConnectionId, | ||
| ) -> crate::Result<BoxFuture<'static, crate::Result<()>>> { | ||
| let (connection, endpoint) = self | ||
| .pending_open | ||
| .remove(&connection_id) | ||
| .ok_or(Error::ConnectionDoesntExist(connection_id))?; | ||
| let bandwidth_sink = self.context.bandwidth_sink.clone(); | ||
| let protocol_set = self.context.protocol_set(connection_id); | ||
| let mut protocol_set = self.context.protocol_set(connection_id); | ||
| let substream_open_timeout = self.config.substream_open_timeout; | ||
| let executor = self.context.executor.clone(); | ||
|
|
||
| tracing::trace!( | ||
| target: LOG_TARGET, | ||
| ?connection_id, | ||
| "start connection", | ||
| ); | ||
|
|
||
| self.context.executor.run(Box::pin(async move { | ||
| let _ = QuicConnection::new( | ||
| connection.peer, | ||
| endpoint, | ||
| connection.connection, | ||
| protocol_set, | ||
| bandwidth_sink, | ||
| substream_open_timeout, | ||
| ) | ||
| .start() | ||
| .await; | ||
| })); | ||
| let peer = connection.peer; | ||
| let endpoint_clone = endpoint.clone(); | ||
|
|
||
| Ok(Box::pin(async move { | ||
| // First, notify all protocols about the connection establishment | ||
| protocol_set.report_connection_established(peer, endpoint_clone).await?; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Maybe we need to modify
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, this could be a great follow-up. I'd like to keep this separate, as we'll need to revalidate whether this adds any connection delays or deadlocks when a single protocol is under pressure during high load. 🙏 |
||
|
|
||
| // After protocols are notified, spawn the connection event loop | ||
| executor.run(Box::pin(async move { | ||
| let _ = QuicConnection::new( | ||
| peer, | ||
| endpoint, | ||
| connection.connection, | ||
| protocol_set, | ||
| bandwidth_sink, | ||
| substream_open_timeout, | ||
| ) | ||
| .start() | ||
| .await; | ||
| })); | ||
|
|
||
| Ok(()) | ||
| Ok(()) | ||
| })) | ||
| } | ||
|
|
||
| fn reject(&mut self, connection_id: ConnectionId) -> crate::Result<()> { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as the other comment, if we fail here we need to clean up some data like this here:
litep2p/src/transport/manager/mod.rs
Line 899 in fe1a448