From 121979213f2007b24043f195f1fa37ff3bc218b9 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Thu, 9 Mar 2023 23:58:40 -0500 Subject: [PATCH] When retrying CASE during commissioning, extend the fail-safe. (#25595) If we don't do this, we can get into a situation where our retries take longer than the fail-safe timer, so we are still retrying by the other side is not even listening anymore. The changes here are as follows: 1) Expose various bits on CASEClient and CASESession to allow us to compute how long it will take before we know whether the next CASE retry has succeeded or failed. 2) Add a way for OperationalSessionSetup to notify its consumers that it's going to retry, and how long it expects it to take before it knows whether the retry has succeeded. 3) Change DeviceCommissioner to extend the fail-safe when it's told that OperationalSessionSetup will retry. --- src/app/CASEClient.cpp | 5 ++ src/app/CASEClient.h | 2 + src/app/CASESessionManager.cpp | 6 +- src/app/CASESessionManager.h | 2 +- src/app/OperationalSessionSetup.cpp | 55 +++++++++++++++--- src/app/OperationalSessionSetup.h | 31 +++++++++- src/controller/CHIPDeviceController.cpp | 59 +++++++++++++++++++- src/controller/CHIPDeviceController.h | 8 +++ src/protocols/secure_channel/CASESession.cpp | 16 +++++- src/protocols/secure_channel/CASESession.h | 6 ++ 10 files changed, 175 insertions(+), 15 deletions(-) diff --git a/src/app/CASEClient.cpp b/src/app/CASEClient.cpp index f6fa82c686e827..7ce5a539e6105a 100644 --- a/src/app/CASEClient.cpp +++ b/src/app/CASEClient.cpp @@ -24,6 +24,11 @@ void CASEClient::SetRemoteMRPIntervals(const ReliableMessageProtocolConfig & rem mCASESession.SetRemoteMRPConfig(remoteMRPConfig); } +const ReliableMessageProtocolConfig & CASEClient::GetRemoteMRPIntervals() +{ + return mCASESession.GetRemoteMRPConfig(); +} + CHIP_ERROR CASEClient::EstablishSession(const CASEClientInitParams & params, const ScopedNodeId & peer, const Transport::PeerAddress & peerAddress, const ReliableMessageProtocolConfig & remoteMRPConfig, diff --git a/src/app/CASEClient.h b/src/app/CASEClient.h index 3a5aa8ded7ff08..b640ae066f0f09 100644 --- a/src/app/CASEClient.h +++ b/src/app/CASEClient.h @@ -54,6 +54,8 @@ class DLL_EXPORT CASEClient public: void SetRemoteMRPIntervals(const ReliableMessageProtocolConfig & remoteMRPConfig); + const ReliableMessageProtocolConfig & GetRemoteMRPIntervals(); + CHIP_ERROR EstablishSession(const CASEClientInitParams & params, const ScopedNodeId & peer, const Transport::PeerAddress & peerAddress, const ReliableMessageProtocolConfig & remoteMRPConfig, SessionEstablishmentDelegate * delegate); diff --git a/src/app/CASESessionManager.cpp b/src/app/CASESessionManager.cpp index 3fca62d5cc37ed..52c48f102672b3 100644 --- a/src/app/CASESessionManager.cpp +++ b/src/app/CASESessionManager.cpp @@ -33,7 +33,7 @@ void CASESessionManager::FindOrEstablishSession(const ScopedNodeId & peerId, Cal Callback::Callback * onFailure #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES , - uint8_t attemptCount + uint8_t attemptCount, Callback::Callback * onRetry #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES ) { @@ -60,6 +60,10 @@ void CASESessionManager::FindOrEstablishSession(const ScopedNodeId & peerId, Cal #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES session->UpdateAttemptCount(attemptCount); + if (onRetry) + { + session->AddRetryHandler(onRetry); + } #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES session->Connect(onConnection, onFailure); diff --git a/src/app/CASESessionManager.h b/src/app/CASESessionManager.h index 455ca0b3b36fe1..8b803a0a10f0a2 100644 --- a/src/app/CASESessionManager.h +++ b/src/app/CASESessionManager.h @@ -85,7 +85,7 @@ class CASESessionManager : public OperationalSessionReleaseDelegate, public Sess Callback::Callback * onFailure #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES , - uint8_t attemptCount = 1 + uint8_t attemptCount = 1, Callback::Callback * onRetry = nullptr #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES ); diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index a87741ea3a246b..1deeebf8c92f5f 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -263,6 +263,15 @@ void OperationalSessionSetup::DequeueConnectionCallbacks(CHIP_ERROR error) mConnectionFailure.DequeueAll(failureReady); mConnectionSuccess.DequeueAll(successReady); +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // Clear out mConnectionRetry, so that those cancelables are not holding + // pointers to us, since we're about to go away. + while (auto * cb = mConnectionRetry.First()) + { + cb->Cancel(); + } +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // // If we encountered no error, go ahead and call all success callbacks. Otherwise, // call the failure callbacks. @@ -304,13 +313,23 @@ void OperationalSessionSetup::DequeueConnectionCallbacks(CHIP_ERROR error) void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) { - VerifyOrReturn(mState != State::Uninitialized && mState != State::NeedsAddress, - ChipLogError(Discovery, "HandleCASEConnectionFailure was called while the device was not initialized")); + VerifyOrReturn(mState == State::Connecting, + ChipLogError(Discovery, "OnSessionEstablishmentError was called while we were not connecting")); if (CHIP_ERROR_TIMEOUT == error) { +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // Make a copy of the ReliableMessageProtocolConfig, since our + // mCaseClient is about to go away. + ReliableMessageProtocolConfig remoteMprConfig = mCASEClient->GetRemoteMRPIntervals(); +#endif + if (CHIP_NO_ERROR == Resolver::Instance().TryNextResult(mAddressLookupHandle)) { +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // Our retry is going to be immediate, once the event loop spins. + NotifyRetryHandlers(error, remoteMprConfig, System::Clock::kZero); +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES MoveToState(State::ResolvingAddress); return; } @@ -318,9 +337,11 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES if (mRemainingAttempts > 0) { - CHIP_ERROR err = ScheduleSessionSetupReattempt(); + System::Clock::Seconds16 reattemptDelay; + CHIP_ERROR err = ScheduleSessionSetupReattempt(reattemptDelay); if (err == CHIP_NO_ERROR) { + NotifyRetryHandlers(error, remoteMprConfig, reattemptDelay); return; } } @@ -333,8 +354,8 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) void OperationalSessionSetup::OnSessionEstablished(const SessionHandle & session) { - VerifyOrReturn(mState != State::Uninitialized, - ChipLogError(Discovery, "HandleCASEConnected was called while the device was not initialized")); + VerifyOrReturn(mState == State::Connecting, + ChipLogError(Discovery, "OnSessionEstablished was called while we were not connecting")); if (!mSecureSession.Grab(session)) return; // Got an invalid session, do not change any state @@ -489,7 +510,7 @@ void OperationalSessionSetup::UpdateAttemptCount(uint8_t attemptCount) } } -CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt() +CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay) { VerifyOrDie(mRemainingAttempts > 0); // Try again, but not if things are in shutdown such that we can't get @@ -500,7 +521,6 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt() } MoveToState(State::NeedsAddress); - System::Clock::Seconds16 timerDelay; // Stop exponential backoff before our delays get too large. // // Note that mAttemptsDone is always > 0 here, because we have @@ -545,6 +565,27 @@ void OperationalSessionSetup::TrySetupAgain(System::Layer * systemLayer, void * self->DequeueConnectionCallbacks(err); // Do not touch `self` instance anymore; it has been destroyed in DequeueConnectionCallbacks. } + +void OperationalSessionSetup::AddRetryHandler(Callback::Callback * onRetry) +{ + mConnectionRetry.Enqueue(onRetry->Cancel()); +} + +void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, const ReliableMessageProtocolConfig & remoteMrpConfig, + System::Clock::Seconds16 retryDelay) +{ + // Compute the time we are likely to need to detect that the retry has + // failed. + System::Clock::Timeout messageTimeout = CASESession::ComputeSigma1ResponseTimeout(remoteMrpConfig); + auto timeoutSecs = std::chrono::duration_cast(messageTimeout); + // Add 1 second in case we had fractional milliseconds in messageTimeout. + timeoutSecs += System::Clock::Seconds16(1); + for (auto * item = mConnectionRetry.First(); item && item != &mConnectionRetry; item = item->mNext) + { + auto cb = Callback::Callback::FromCancelable(item); + cb->mCall(cb->mContext, mPeerId, error, timeoutSecs + retryDelay); + } +} #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES } // namespace chip diff --git a/src/app/OperationalSessionSetup.h b/src/app/OperationalSessionSetup.h index 029d355e25ef21..653cb61a5682f7 100644 --- a/src/app/OperationalSessionSetup.h +++ b/src/app/OperationalSessionSetup.h @@ -36,8 +36,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -117,8 +119,20 @@ class OperationalDeviceProxy : public DeviceProxy * implementations as an example. */ typedef void (*OnDeviceConnected)(void * context, Messaging::ExchangeManager & exchangeMgr, const SessionHandle & sessionHandle); + +/** + * Callback prototype when secure session establishment fails. + */ typedef void (*OnDeviceConnectionFailure)(void * context, const ScopedNodeId & peerId, CHIP_ERROR error); +/** + * Callback prototype when secure session establishement has failed and will be + * retried. retryTimeout indicates how much time will pass before we know + * whether the retry has timed out waiting for a response to our Sigma1 message. + */ +typedef void (*OnDeviceConnectionRetry)(void * context, const ScopedNodeId & peerId, CHIP_ERROR error, + System::Clock::Seconds16 retryTimeout); + /** * Object used to either establish a connection to peer or performing address lookup to a peer. * @@ -227,6 +241,9 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES // Update our remaining attempt count to be at least the given value. void UpdateAttemptCount(uint8_t attemptCount); + + // Add a retry handler for this session setup. + void AddRetryHandler(Callback::Callback * onRetry); #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES private: @@ -268,6 +285,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES uint8_t mRemainingAttempts = 0; uint8_t mAttemptsDone = 0; + + Callback::CallbackDeque mConnectionRetry; #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES void MoveToState(State aTargetState); @@ -313,14 +332,22 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES /** - * Schedule a setup reattempt, if possible. + * Schedule a setup reattempt, if possible. The outparam indicates how long + * it will be before the reattempt happens. */ - CHIP_ERROR ScheduleSessionSetupReattempt(); + CHIP_ERROR ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay); /** * Helper for our backoff retry timer. */ static void TrySetupAgain(System::Layer * systemLayer, void * state); + + /** + * Helper to notify our retry callbacks that a setup error occurred and we + * will retry. + */ + void NotifyRetryHandlers(CHIP_ERROR error, const ReliableMessageProtocolConfig & remoteMrpConfig, + System::Clock::Seconds16 retryDelay); #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES }; diff --git a/src/controller/CHIPDeviceController.cpp b/src/controller/CHIPDeviceController.cpp index f0caac0d738907..1cba88f8b97a0b 100644 --- a/src/controller/CHIPDeviceController.cpp +++ b/src/controller/CHIPDeviceController.cpp @@ -398,6 +398,9 @@ ControllerDeviceInitParams DeviceController::GetControllerDeviceInitParams() DeviceCommissioner::DeviceCommissioner() : mOnDeviceConnectedCallback(OnDeviceConnectedFn, this), mOnDeviceConnectionFailureCallback(OnDeviceConnectionFailureFn, this), +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + mOnDeviceConnectionRetryCallback(OnDeviceConnectionRetryFn, this), +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES mDeviceAttestationInformationVerificationCallback(OnDeviceAttestationInformationVerification, this), mDeviceNOCChainCallback(OnDeviceNOCChainGeneration, this), mSetUpCodePairer(this) { @@ -1731,6 +1734,60 @@ void DeviceCommissioner::OnDeviceConnectionFailureFn(void * context, const Scope } } +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES +// No specific action to take on either success or failure here; we're just +// trying to bump the fail-safe, and if that fails it's not clear there's much +// we can to with that. +static void OnExtendFailsafeForCASERetryFailure(void * context, CHIP_ERROR error) +{ + ChipLogError(Controller, "Failed to extend fail-safe for CASE retry: %" CHIP_ERROR_FORMAT, error.Format()); +} +static void +OnExtendFailsafeForCASERetrySuccess(void * context, + const app::Clusters::GeneralCommissioning::Commands::ArmFailSafeResponse::DecodableType & data) +{ + ChipLogProgress(Controller, "Status of extending fail-safe for CASE retry: %u", to_underlying(data.errorCode)); +} + +void DeviceCommissioner::OnDeviceConnectionRetryFn(void * context, const ScopedNodeId & peerId, CHIP_ERROR error, + System::Clock::Seconds16 retryTimeout) +{ + ChipLogError(Controller, + "Session establishment failed for " ChipLogFormatScopedNodeId ", error: %" CHIP_ERROR_FORMAT + ". Next retry expected to get a response to Sigma1 or fail within %d seconds", + ChipLogValueScopedNodeId(peerId), error.Format(), retryTimeout.count()); + + auto self = static_cast(context); + + // We need to do the fail-safe arming over the PASE session. + auto * commissioneeDevice = self->FindCommissioneeDevice(peerId.GetNodeId()); + if (!commissioneeDevice) + { + // Commissioning canceled, presumably. Just ignore the notification, + // not much we can do here. + return; + } + + // Extend by the default failsafe timeout plus our retry timeout, so we can + // be sure the fail-safe will not expire before we try the next time, if + // there will be a next time. + // + // TODO: Make it possible for our clients to control the exact timeout here? + uint16_t failsafeTimeout; + if (UINT16_MAX - retryTimeout.count() < kDefaultFailsafeTimeout) + { + failsafeTimeout = UINT16_MAX; + } + else + { + failsafeTimeout = static_cast(retryTimeout.count() + kDefaultFailsafeTimeout); + } + self->ExtendArmFailSafe(commissioneeDevice, CommissioningStage::kFindOperational, failsafeTimeout, + MakeOptional(kMinimumCommissioningStepTimeout), OnExtendFailsafeForCASERetrySuccess, + OnExtendFailsafeForCASERetryFailure); +} +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // ClusterStateCache::Callback impl void DeviceCommissioner::OnDone(app::ReadClient *) { @@ -2444,7 +2501,7 @@ void DeviceCommissioner::PerformCommissioningStep(DeviceProxy * proxy, Commissio &mOnDeviceConnectionFailureCallback #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES , - /* attemptCount = */ 3 + /* attemptCount = */ 3, &mOnDeviceConnectionRetryCallback #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES ); } diff --git a/src/controller/CHIPDeviceController.h b/src/controller/CHIPDeviceController.h index d73950fd110b75..d60ad34768caf0 100644 --- a/src/controller/CHIPDeviceController.h +++ b/src/controller/CHIPDeviceController.h @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -787,6 +788,10 @@ class DLL_EXPORT DeviceCommissioner : public DeviceController, static void OnDeviceConnectedFn(void * context, Messaging::ExchangeManager & exchangeMgr, const SessionHandle & sessionHandle); static void OnDeviceConnectionFailureFn(void * context, const ScopedNodeId & peerId, CHIP_ERROR error); +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + static void OnDeviceConnectionRetryFn(void * context, const ScopedNodeId & peerId, CHIP_ERROR error, + System::Clock::Seconds16 retryTimeout); +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES static void OnDeviceAttestationInformationVerification(void * context, const Credentials::DeviceAttestationVerifier::AttestationInfo & info, @@ -897,6 +902,9 @@ class DLL_EXPORT DeviceCommissioner : public DeviceController, chip::Callback::Callback mOnDeviceConnectedCallback; chip::Callback::Callback mOnDeviceConnectionFailureCallback; +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + chip::Callback::Callback mOnDeviceConnectionRetryCallback; +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES chip::Callback::Callback mDeviceAttestationInformationVerificationCallback; diff --git a/src/protocols/secure_channel/CASESession.cpp b/src/protocols/secure_channel/CASESession.cpp index 3e3cbda767847d..ac931c7ba5f135 100644 --- a/src/protocols/secure_channel/CASESession.cpp +++ b/src/protocols/secure_channel/CASESession.cpp @@ -125,8 +125,9 @@ static_assert(sizeof(kTBEData2_Nonce) == sizeof(kTBEData3_Nonce), "TBEData2_Nonc // // The session establishment fails if the response is not received within the resulting timeout window, // which accounts for both transport latency and the server-side latency. -static constexpr ExchangeContext::Timeout kExpectedLowProcessingTime = System::Clock::Seconds16(2); -static constexpr ExchangeContext::Timeout kExpectedHighProcessingTime = System::Clock::Seconds16(30); +static constexpr ExchangeContext::Timeout kExpectedLowProcessingTime = System::Clock::Seconds16(2); +static constexpr ExchangeContext::Timeout kExpectedSigma1ProcessingTime = kExpectedLowProcessingTime; +static constexpr ExchangeContext::Timeout kExpectedHighProcessingTime = System::Clock::Seconds16(30); CASESession::~CASESession() { @@ -273,7 +274,7 @@ CHIP_ERROR CASESession::EstablishSession(SessionManager & sessionManager, Fabric mSessionResumptionStorage = sessionResumptionStorage; mLocalMRPConfig = mrpLocalConfig; - mExchangeCtxt->UseSuggestedResponseTimeout(kExpectedLowProcessingTime); + mExchangeCtxt->UseSuggestedResponseTimeout(kExpectedSigma1ProcessingTime); mPeerNodeId = peerScopedNodeId.GetNodeId(); mLocalNodeId = fabricInfo->GetNodeId(); @@ -1974,4 +1975,13 @@ CHIP_ERROR CASESession::OnMessageReceived(ExchangeContext * ec, const PayloadHea return err; } +System::Clock::Timeout CASESession::ComputeSigma1ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig) +{ + return GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout, + // Assume peer is idle, since that's what we + // will assume for our initial message. + System::Clock::kZero, Transport::kMinActiveTime) + + kExpectedSigma1ProcessingTime; +} + } // namespace chip diff --git a/src/protocols/secure_channel/CASESession.h b/src/protocols/secure_channel/CASESession.h index 1800fef789d775..9cf2bba6ee651d 100644 --- a/src/protocols/secure_channel/CASESession.h +++ b/src/protocols/secure_channel/CASESession.h @@ -35,11 +35,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -175,6 +177,10 @@ class DLL_EXPORT CASESession : public Messaging::UnsolicitedMessageHandler, FabricIndex GetFabricIndex() const { return mFabricIndex; } + // Compute our Sigma1 response timeout. This can give consumers an idea of + // how long it will take to detect that our Sigma1 did not get through. + static System::Clock::Timeout ComputeSigma1ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig); + // TODO: remove Clear, we should create a new instance instead reset the old instance. /** @brief This function zeroes out and resets the memory used by the object. **/