From 3803635b7f44e984ac25d32cc25e15c9e4107779 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Thu, 7 Mar 2024 16:46:21 -0500 Subject: [PATCH] Stop treating BUSY response as fatal when CASE re-attempts are enabled. (#32490) 1. We now communicate the requested delay to the session establishment delegate when we get a BUSY response. 2. OperationalSessionSetup, when it has CASE re-attempts enabled, treats a BUSY response as a trigger to reattempt, not a fatal error. 3. In CASEServer, set the requested delay to a better value if we have sent Sigma2 and are waiting for a response to that. Fixes https://github.com/project-chip/connectedhomeip/issues/28288 --- src/app/OperationalSessionSetup.cpp | 43 ++++++++++++++++--- src/app/OperationalSessionSetup.h | 3 ++ src/protocols/secure_channel/CASEServer.cpp | 28 ++++++++++-- src/protocols/secure_channel/CASESession.cpp | 7 ++- src/protocols/secure_channel/CASESession.h | 3 +- src/protocols/secure_channel/PASESession.cpp | 3 +- src/protocols/secure_channel/PASESession.h | 3 +- src/protocols/secure_channel/PairingSession.h | 12 ++++-- .../SessionEstablishmentDelegate.h | 10 +++++ 9 files changed, 97 insertions(+), 15 deletions(-) diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index 179e2a3df5e120..6df07ca6b0568e 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -430,7 +430,7 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error, Sess // member instead of having a boolean // mTryingNextResultDueToSessionEstablishmentError, so we can recover the // error in UpdateDeviceData. - if (CHIP_ERROR_TIMEOUT == error) + if (CHIP_ERROR_TIMEOUT == error || CHIP_ERROR_BUSY == error) { #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES // Make a copy of the ReliableMessageProtocolConfig, since our @@ -480,6 +480,15 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error, Sess // Do not touch `this` instance anymore; it has been destroyed in DequeueConnectionCallbacks. } +void OperationalSessionSetup::OnResponderBusy(System::Clock::Milliseconds16 requestedDelay) +{ +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // Store the requested delay, so that we can use it for scheduling our + // retry. + mRequestedBusyDelay = requestedDelay; +#endif +} + void OperationalSessionSetup::OnSessionEstablished(const SessionHandle & session) { VerifyOrReturn(mState == State::Connecting, @@ -705,9 +714,22 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock: static_assert(UINT16_MAX / CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS >= (1 << CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF), "Our backoff calculation will overflow."); - timerDelay = System::Clock::Seconds16( + System::Clock::Timeout actualTimerDelay = System::Clock::Seconds16( static_cast(CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS << min((mAttemptsDone - 1), CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF))); + const bool responseWasBusy = mRequestedBusyDelay != System::Clock::kZero; + if (responseWasBusy) + { + if (mRequestedBusyDelay > actualTimerDelay) + { + actualTimerDelay = mRequestedBusyDelay; + } + + // Reset mRequestedBusyDelay now that we have consumed it, so it does + // not affect future reattempts not triggered by a busy response. + mRequestedBusyDelay = System::Clock::kZero; + } + if (mAttemptsDone % 2 == 0) { // It's possible that the other side received one of our Sigma1 messages @@ -716,11 +738,22 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock: // listening for Sigma1 messages again. // // To handle that, on every other retry, add the amount of time it would - // take the other side to time out. + // take the other side to time out. It would be nice if we could rely + // on the delay reported in a BUSY response to just tell us that value, + // but in practice for old devices BUSY often sends some hardcoded value + // that tells us nothing about when the other side will decide it has + // timed out. auto additionalTimeout = CASESession::ComputeSigma2ResponseTimeout(GetLocalMRPConfig().ValueOr(GetDefaultMRPConfig())); - timerDelay += std::chrono::duration_cast(additionalTimeout); + actualTimerDelay += additionalTimeout; } - CHIP_ERROR err = mInitParams.exchangeMgr->GetSessionManager()->SystemLayer()->StartTimer(timerDelay, TrySetupAgain, this); + timerDelay = std::chrono::duration_cast(actualTimerDelay); + + CHIP_ERROR err = mInitParams.exchangeMgr->GetSessionManager()->SystemLayer()->StartTimer(actualTimerDelay, TrySetupAgain, this); + + // TODO: If responseWasBusy, should we increment, mRemainingAttempts and + // mResolveAttemptsAllowed, since we were explicitly told to retry? Hard to + // tell what consumers expect out of a capped retry count here. + // The cast on count() is needed because the type count() returns might not // actually be uint16_t; on some platforms it's int. ChipLogProgress(Discovery, diff --git a/src/app/OperationalSessionSetup.h b/src/app/OperationalSessionSetup.h index 45b571a08aa633..9d24faad5efabb 100644 --- a/src/app/OperationalSessionSetup.h +++ b/src/app/OperationalSessionSetup.h @@ -227,6 +227,7 @@ class DLL_EXPORT OperationalSessionSetup : public SessionEstablishmentDelegate, //////////// SessionEstablishmentDelegate Implementation /////////////// void OnSessionEstablished(const SessionHandle & session) override; void OnSessionEstablishmentError(CHIP_ERROR error, SessionEstablishmentStage stage) override; + void OnResponderBusy(System::Clock::Milliseconds16 requestedDelay) override; ScopedNodeId GetPeerId() const { return mPeerId; } @@ -319,6 +320,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionEstablishmentDelegate, uint8_t mResolveAttemptsAllowed = 0; + System::Clock::Milliseconds16 mRequestedBusyDelay = System::Clock::kZero; + Callback::CallbackDeque mConnectionRetry; #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES diff --git a/src/protocols/secure_channel/CASEServer.cpp b/src/protocols/secure_channel/CASEServer.cpp index 2ad196a31b9461..df0984d4d94eee 100644 --- a/src/protocols/secure_channel/CASEServer.cpp +++ b/src/protocols/secure_channel/CASEServer.cpp @@ -90,9 +90,31 @@ CHIP_ERROR CASEServer::OnMessageReceived(Messaging::ExchangeContext * ec, const // Handshake wasn't stuck, send the busy status report and let the existing handshake continue. // A successful CASE handshake can take several seconds and some may time out (30 seconds or more). - // TODO: Come up with better estimate: https://github.com/project-chip/connectedhomeip/issues/28288 - // For now, setting minimum wait time to 5000 milliseconds. - CHIP_ERROR err = SendBusyStatusReport(ec, System::Clock::Milliseconds16(5000)); + + System::Clock::Milliseconds16 delay = System::Clock::kZero; + if (GetSession().GetState() == CASESession::State::kSentSigma2) + { + // The delay should be however long we think it will take for + // that to time out. + auto sigma2Timeout = CASESession::ComputeSigma2ResponseTimeout(GetSession().GetRemoteMRPConfig()); + if (sigma2Timeout < System::Clock::Milliseconds16::max()) + { + delay = std::chrono::duration_cast(sigma2Timeout); + } + else + { + // Avoid overflow issues, just wait for as long as we can to + // get close to our expected Sigma2 timeout. + delay = System::Clock::Milliseconds16::max(); + } + } + else + { + // For now, setting minimum wait time to 5000 milliseconds if we + // have no other information. + delay = System::Clock::Milliseconds16(5000); + } + CHIP_ERROR err = SendBusyStatusReport(ec, delay); if (err != CHIP_NO_ERROR) { ChipLogError(Inet, "Failed to send the busy status report, err:%" CHIP_ERROR_FORMAT, err.Format()); diff --git a/src/protocols/secure_channel/CASESession.cpp b/src/protocols/secure_channel/CASESession.cpp index 30d1f71cb0ff64..a9b62489783794 100644 --- a/src/protocols/secure_channel/CASESession.cpp +++ b/src/protocols/secure_channel/CASESession.cpp @@ -1966,7 +1966,8 @@ void CASESession::OnSuccessStatusReport() Finish(); } -CHIP_ERROR CASESession::OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode) +CHIP_ERROR CASESession::OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode, + Optional protocolData) { CHIP_ERROR err = CHIP_NO_ERROR; switch (protocolCode) @@ -1981,6 +1982,10 @@ CHIP_ERROR CASESession::OnFailureStatusReport(Protocols::SecureChannel::GeneralS case kProtocolCodeBusy: err = CHIP_ERROR_BUSY; + if (protocolData.HasValue()) + { + mDelegate->OnResponderBusy(System::Clock::Milliseconds16(static_cast(protocolData.Value()))); + } break; default: diff --git a/src/protocols/secure_channel/CASESession.h b/src/protocols/secure_channel/CASESession.h index cb7f3c7f13380d..9e41f6c69fbe84 100644 --- a/src/protocols/secure_channel/CASESession.h +++ b/src/protocols/secure_channel/CASESession.h @@ -272,7 +272,8 @@ class DLL_EXPORT CASESession : public Messaging::UnsolicitedMessageHandler, const ByteSpan & skInfo, const ByteSpan & nonce); void OnSuccessStatusReport() override; - CHIP_ERROR OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode) override; + CHIP_ERROR OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode, + Optional protocolData) override; void AbortPendingEstablish(CHIP_ERROR err); diff --git a/src/protocols/secure_channel/PASESession.cpp b/src/protocols/secure_channel/PASESession.cpp index 50b186100841e0..40dc67793604e2 100644 --- a/src/protocols/secure_channel/PASESession.cpp +++ b/src/protocols/secure_channel/PASESession.cpp @@ -759,7 +759,8 @@ void PASESession::OnSuccessStatusReport() Finish(); } -CHIP_ERROR PASESession::OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode) +CHIP_ERROR PASESession::OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode, + Optional protocolData) { CHIP_ERROR err = CHIP_NO_ERROR; switch (protocolCode) diff --git a/src/protocols/secure_channel/PASESession.h b/src/protocols/secure_channel/PASESession.h index 393d3a65fc958a..e270baf42e80f1 100644 --- a/src/protocols/secure_channel/PASESession.h +++ b/src/protocols/secure_channel/PASESession.h @@ -203,7 +203,8 @@ class DLL_EXPORT PASESession : public Messaging::UnsolicitedMessageHandler, CHIP_ERROR HandleMsg3(System::PacketBufferHandle && msg); void OnSuccessStatusReport() override; - CHIP_ERROR OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode) override; + CHIP_ERROR OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode, + Optional protocolData) override; void Finish(); diff --git a/src/protocols/secure_channel/PairingSession.h b/src/protocols/secure_channel/PairingSession.h index 8ed9f269b9a32c..f49dbf7997f4e2 100644 --- a/src/protocols/secure_channel/PairingSession.h +++ b/src/protocols/secure_channel/PairingSession.h @@ -26,6 +26,7 @@ #pragma once #include +#include #include #include #include @@ -129,7 +130,11 @@ class DLL_EXPORT PairingSession : public SessionDelegate void SetPeerSessionId(uint16_t id) { mPeerSessionId.SetValue(id); } virtual void OnSuccessStatusReport() {} - virtual CHIP_ERROR OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode) + + // Handle a failure StatusReport message from the server. protocolData will + // depend on exactly what the generalCode/protocolCode are. + virtual CHIP_ERROR OnFailureStatusReport(Protocols::SecureChannel::GeneralStatusCode generalCode, uint16_t protocolCode, + Optional protocolData) { return CHIP_ERROR_INTERNAL; } @@ -174,6 +179,7 @@ class DLL_EXPORT PairingSession : public SessionDelegate return CHIP_NO_ERROR; } + Optional protocolData; if (report.GetGeneralCode() == Protocols::SecureChannel::GeneralStatusCode::kBusy && report.GetProtocolCode() == Protocols::SecureChannel::kProtocolCodeBusy) { @@ -189,15 +195,15 @@ class DLL_EXPORT PairingSession : public SessionDelegate } else { - // TODO: CASE: Notify minimum wait time to clients on receiving busy status report #28290 ChipLogProgress(SecureChannel, "Received busy status report with minimum wait time: %u ms", minimumWaitTime); + protocolData.Emplace(minimumWaitTime); } } } // It's very important that we propagate the return value from // OnFailureStatusReport out to the caller. Make sure we return it directly. - return OnFailureStatusReport(report.GetGeneralCode(), report.GetProtocolCode()); + return OnFailureStatusReport(report.GetGeneralCode(), report.GetProtocolCode(), protocolData); } /** diff --git a/src/protocols/secure_channel/SessionEstablishmentDelegate.h b/src/protocols/secure_channel/SessionEstablishmentDelegate.h index a50949e6f6b2e8..c0ba56d9b01e09 100644 --- a/src/protocols/secure_channel/SessionEstablishmentDelegate.h +++ b/src/protocols/secure_channel/SessionEstablishmentDelegate.h @@ -25,6 +25,7 @@ #pragma once +#include #include #include #include @@ -80,6 +81,15 @@ class DLL_EXPORT SessionEstablishmentDelegate */ virtual void OnSessionEstablished(const SessionHandle & session) {} + /** + * Called when the responder has responded with a "busy" status code and + * provided a requested delay. + * + * This call will be followed by an OnSessionEstablishmentError with + * CHIP_ERROR_BUSY as the error. + */ + virtual void OnResponderBusy(System::Clock::Milliseconds16 requestedDelay) {} + virtual ~SessionEstablishmentDelegate() {} };