diff --git a/include/istio/mixerclient/client.h b/include/istio/mixerclient/client.h index 69233731558..648c5588636 100644 --- a/include/istio/mixerclient/client.h +++ b/include/istio/mixerclient/client.h @@ -114,9 +114,17 @@ struct Statistics { // // Total number of report calls. - uint64_t total_report_calls_{0}; + uint64_t total_report_calls_{0}; // 1.0 // Total number of remote report calls. - uint64_t total_remote_report_calls_{0}; + uint64_t total_remote_report_calls_{0}; // 1.0 + // Remote report calls that succeeed + uint64_t total_remote_report_successes_{0}; // 1.1 + // Remote report calls that fail due to timeout waiting for the response + uint64_t total_remote_report_timeouts_{0}; // 1.1 + // Remote report calls that fail sending the request (socket connect or write) + uint64_t total_remote_report_send_errors_{0}; // 1.1 + // Remote report calls that fail do to some other error + uint64_t total_remote_report_other_errors_{0}; // 1.1 }; class MixerClient { diff --git a/src/envoy/utils/stats.cc b/src/envoy/utils/stats.cc index eec09c69efb..3e316a06b00 100644 --- a/src/envoy/utils/stats.cc +++ b/src/envoy/utils/stats.cc @@ -87,6 +87,10 @@ void MixerStatsObject::CheckAndUpdateStats( CHECK_AND_UPDATE_STATS(total_report_calls_); CHECK_AND_UPDATE_STATS(total_remote_report_calls_); + CHECK_AND_UPDATE_STATS(total_remote_report_successes_); + CHECK_AND_UPDATE_STATS(total_remote_report_timeouts_); + CHECK_AND_UPDATE_STATS(total_remote_report_send_errors_); + CHECK_AND_UPDATE_STATS(total_remote_report_other_errors_); // Copy new_stats to old_stats_ for next stats update. old_stats_ = new_stats; diff --git a/src/envoy/utils/stats.h b/src/envoy/utils/stats.h index b9fc3096159..e1f93510dea 100644 --- a/src/envoy/utils/stats.h +++ b/src/envoy/utils/stats.h @@ -53,7 +53,11 @@ namespace Utils { COUNTER(total_remote_call_retries) \ COUNTER(total_remote_call_cancellations) \ COUNTER(total_report_calls) \ - COUNTER(total_remote_report_calls) + COUNTER(total_remote_report_calls) \ + COUNTER(total_remote_report_successes) \ + COUNTER(total_remote_report_timeouts) \ + COUNTER(total_remote_report_send_errors) \ + COUNTER(total_remote_report_other_errors) // clang-format on /** diff --git a/src/istio/mixerclient/BUILD b/src/istio/mixerclient/BUILD index 129ff01116f..7a0f0194135 100644 --- a/src/istio/mixerclient/BUILD +++ b/src/istio/mixerclient/BUILD @@ -40,7 +40,7 @@ cc_library( "attribute_compressor.h", "check_cache.cc", "check_cache.h", - "check_context.h", + "check_context.h", "client_impl.cc", "client_impl.h", "global_dictionary.cc", @@ -51,7 +51,9 @@ cc_library( "referenced.h", "report_batch.cc", "report_batch.h", - "shared_attributes.h", + "shared_attributes.h", + "status_util.cc", + "status_util.h" ], visibility = ["//visibility:public"], deps = [ diff --git a/src/istio/mixerclient/client_impl.cc b/src/istio/mixerclient/client_impl.cc index 3a5e9a0b1b5..966b09b679c 100644 --- a/src/istio/mixerclient/client_impl.cc +++ b/src/istio/mixerclient/client_impl.cc @@ -18,6 +18,7 @@ #include #include "include/istio/mixerclient/check_response.h" #include "include/istio/utils/protobuf.h" +#include "src/istio/mixerclient/status_util.h" #include "src/istio/utils/logger.h" using ::google::protobuf::util::Status; @@ -33,36 +34,6 @@ using ::istio::mixerclient::SharedAttributesSharedPtr; namespace istio { namespace mixerclient { -static ::google::protobuf::StringPiece TIMEOUT_MESSAGE( - "upstream request timeout"); -static ::google::protobuf::StringPiece SEND_ERROR_MESSAGE( - "upstream connect error or disconnect/reset before headers"); - -enum class TransportResult { - SUCCESS, // Response received - SEND_ERROR, // Cannot connect to peer or send request to peer. - RESPONSE_TIMEOUT, // Connected to peer and sent request, but didn't receive a - // response in time. - OTHER // Something else went wrong -}; - -TransportResult TransportStatus(const Status &status) { - if (status.ok()) { - return TransportResult::SUCCESS; - } - - if (Code::UNAVAILABLE == status.error_code()) { - if (TIMEOUT_MESSAGE == status.error_message()) { - return TransportResult::RESPONSE_TIMEOUT; - } - if (SEND_ERROR_MESSAGE == status.error_message()) { - return TransportResult::SEND_ERROR; - } - } - - return TransportResult::OTHER; -} - MixerClientImpl::MixerClientImpl(const MixerClientOptions &options) : options_(options) { timer_create_ = options.env.timer_create_func; @@ -359,6 +330,14 @@ void MixerClientImpl::GetStatistics(Statistics *stat) const { stat->total_report_calls_ = report_batch_->total_report_calls(); stat->total_remote_report_calls_ = report_batch_->total_remote_report_calls(); + stat->total_remote_report_successes_ = + report_batch_->total_remote_report_successes(); + stat->total_remote_report_timeouts_ = + report_batch_->total_remote_report_timeouts(); + stat->total_remote_report_send_errors_ = + report_batch_->total_remote_report_send_errors(); + stat->total_remote_report_other_errors_ = + report_batch_->total_remote_report_other_errors(); } // Creates a MixerClient object. diff --git a/src/istio/mixerclient/report_batch.cc b/src/istio/mixerclient/report_batch.cc index 0b2ea360e67..7d344089eec 100644 --- a/src/istio/mixerclient/report_batch.cc +++ b/src/istio/mixerclient/report_batch.cc @@ -15,6 +15,7 @@ #include "src/istio/mixerclient/report_batch.h" #include "include/istio/utils/protobuf.h" +#include "src/istio/mixerclient/status_util.h" #include "src/istio/utils/logger.h" using ::google::protobuf::util::Status; @@ -71,12 +72,34 @@ void ReportBatch::FlushWithLock() { ++total_remote_report_calls_; auto request = batch_compressor_->Finish(); - ReportResponse* response = new ReportResponse; + std::shared_ptr response{new ReportResponse()}; + + // TODO(jblatt) I replaced a ReportResponse raw pointer with a shared + // pointer so at least the memory will be freed if this lambda is deleted + // without being called, but really this should be a unique_ptr that is + // moved into the transport_ and then moved into the lambda if invoked. + transport_(request, &*response, [this, response](const Status& status) { + // + // Classify and track transport errors + // + + TransportResult result = TransportStatus(status); + + switch (result) { + case TransportResult::SUCCESS: + ++total_remote_report_successes_; + break; + case TransportResult::RESPONSE_TIMEOUT: + ++total_remote_report_timeouts_; + break; + case TransportResult::SEND_ERROR: + ++total_remote_report_send_errors_; + break; + case TransportResult::OTHER: + ++total_remote_report_other_errors_; + break; + } - // TODO(jblatt) should an async call be made while this lock is held? Can the - // request send block()? - transport_(request, response, [this, response](const Status& status) { - delete response; if (!status.ok()) { if (MIXER_WARN_ENABLED && 0 == REPORT_FAIL_LOG_MESSAGES++ % REPORT_FAIL_LOG_MODULUS) { diff --git a/src/istio/mixerclient/report_batch.h b/src/istio/mixerclient/report_batch.h index 60dfe1de22d..3ce9aaca004 100644 --- a/src/istio/mixerclient/report_batch.h +++ b/src/istio/mixerclient/report_batch.h @@ -44,6 +44,22 @@ class ReportBatch { return total_remote_report_calls_; } + uint64_t total_remote_report_successes() const { + return total_remote_report_successes_; + } + + uint64_t total_remote_report_timeouts() const { + return total_remote_report_timeouts_; + } + + uint64_t total_remote_report_send_errors() const { + return total_remote_report_send_errors_; + } + + uint64_t total_remote_report_other_errors() const { + return total_remote_report_other_errors_; + } + private: void FlushWithLock(); @@ -68,8 +84,12 @@ class ReportBatch { // batched report compressor std::unique_ptr batch_compressor_; - std::atomic_int_fast64_t total_report_calls_; - std::atomic_int_fast64_t total_remote_report_calls_; + std::atomic total_report_calls_{0}; // 1.0 + std::atomic total_remote_report_calls_{0}; // 1.0 + std::atomic total_remote_report_successes_{0}; // 1.1 + std::atomic total_remote_report_timeouts_{0}; // 1.1 + std::atomic total_remote_report_send_errors_{0}; // 1.1 + std::atomic total_remote_report_other_errors_{0}; // 1.1 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ReportBatch); }; diff --git a/src/istio/mixerclient/status_util.cc b/src/istio/mixerclient/status_util.cc new file mode 100644 index 00000000000..bee88008827 --- /dev/null +++ b/src/istio/mixerclient/status_util.cc @@ -0,0 +1,45 @@ +/* Copyright 2019 Istio Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/istio/mixerclient/status_util.h" + +namespace istio { +namespace mixerclient { + +static ::google::protobuf::StringPiece TIMEOUT_MESSAGE( + "upstream request timeout"); +static ::google::protobuf::StringPiece SEND_ERROR_MESSAGE( + "upstream connect error or disconnect/reset before headers"); + +TransportResult TransportStatus( + const ::google::protobuf::util::Status &status) { + if (status.ok()) { + return TransportResult::SUCCESS; + } + + if (::google::protobuf::util::error::Code::UNAVAILABLE == + status.error_code()) { + if (TIMEOUT_MESSAGE == status.error_message()) { + return TransportResult::RESPONSE_TIMEOUT; + } + if (SEND_ERROR_MESSAGE == status.error_message()) { + return TransportResult::SEND_ERROR; + } + } + + return TransportResult::OTHER; +} +} // namespace mixerclient +} // namespace istio diff --git a/src/istio/mixerclient/status_util.h b/src/istio/mixerclient/status_util.h new file mode 100644 index 00000000000..843d2745764 --- /dev/null +++ b/src/istio/mixerclient/status_util.h @@ -0,0 +1,35 @@ +/* Copyright 2019 Istio Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "google/protobuf/stubs/status.h" + +namespace istio { +namespace mixerclient { + +enum class TransportResult { + SUCCESS, // Response received + SEND_ERROR, // Cannot connect to peer or send request to peer. + RESPONSE_TIMEOUT, // Connected to peer and sent request, but didn't receive a + // response in time. + OTHER // Something else went wrong +}; + +extern TransportResult TransportStatus( + const ::google::protobuf::util::Status &status); + +} // namespace mixerclient +} // namespace istio diff --git a/test/integration/mixer_fault_test.cc b/test/integration/mixer_fault_test.cc index 302e00e1117..6384b200b4b 100644 --- a/test/integration/mixer_fault_test.cc +++ b/test/integration/mixer_fault_test.cc @@ -630,6 +630,9 @@ TEST_F(MixerFaultTest, TolerateTelemetryBlackhole) { client->run(connections_to_initiate, requests_to_send, std::move(request), std::chrono::milliseconds(10'000)); + std::unordered_map counters; + extractCounters("http_mixer_filter", counters); + // shutdown envoy by destroying it test_server_ = nullptr; // wait until the upstreams have closed all connections they accepted. @@ -674,6 +677,17 @@ TEST_F(MixerFaultTest, TolerateTelemetryBlackhole) { // Telemetry server accept callback is called at least once (h2 socket reuse // means may only be called once) EXPECT_GE(telemetry_cluster.connectionsAccepted(), 1); + + // Assertions against the mixer filter's internal counters. + EXPECT_EQ(counters["http_mixer_filter.total_report_calls"], requests_to_send); + EXPECT_IN_RANGE(counters["http_mixer_filter.total_remote_report_calls_"], 0, + requests_to_send * 0.12); + // All remote reports should time out + EXPECT_EQ(counters["http_mixer_filter.total_remote_report_timeouts_"], + counters["http_mixer_filter.total_remote_report_calls_"]); + EXPECT_EQ(counters["http_mixer_filter.total_remote_report_successes_"], 0); + EXPECT_EQ(counters["http_mixer_filter.total_remote_report_send_errors_"], 0); + EXPECT_EQ(counters["http_mixer_filter.total_remote_report_other_errors_"], 0); } TEST_F(MixerFaultTest, FailOpenAndSendPolicyResponseSlowly) {