-
Notifications
You must be signed in to change notification settings - Fork 5.5k
[WIP] Expose Rest API to return metrics in prometheus format #21599
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1d279f8
49992f3
4d489a4
c739d83
5c510e7
3ce8d6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| /* | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| #include "StatsReporterImpl.h" | ||
|
|
||
| namespace facebook::presto { | ||
|
|
||
| // Registers a stat identified by a string view. A folly::StringPiece is not | ||
| // guaranteed to be NUL-terminated, so the key is copied into a std::string | ||
| // before it is handed to the const char* overload. | ||
| void StatsReporterImpl::registerMetricExportType( | ||
| folly::StringPiece key, | ||
| facebook::velox::StatType statType) const { | ||
| const std::string ownedKey = key.str(); | ||
| registerMetricExportType(ownedKey.c_str(), statType); | ||
| } | ||
|
|
||
| // Records the stat type for `key` and creates its value slot, initialised to | ||
| // zero. A key that is already present keeps its existing type and value. | ||
| void StatsReporterImpl::registerMetricExportType( | ||
| const char* key, | ||
| facebook::velox::StatType statType) const { | ||
| std::lock_guard<std::mutex> guard(mutex_); | ||
| const std::string statName{key}; | ||
| registeredStats_.emplace(statName, statType); | ||
| metricsMap_.emplace(statName, 0); | ||
| } | ||
|
|
||
| // Applies `value` to a registered stat: COUNT stats accumulate, every other | ||
| // stat type is overwritten with the latest value. Keys that were never | ||
| // registered are logged at VLOG(1) and ignored. | ||
| void StatsReporterImpl::addMetricValue(const char* key, size_t value) const { | ||
| std::lock_guard<std::mutex> guard(mutex_); | ||
| const auto registered = registeredStats_.find(key); | ||
| if (registered == registeredStats_.end()) { | ||
| VLOG(1) << "addMetricValue() for unregistered stat " << key; | ||
| return; | ||
| } | ||
| auto& current = metricsMap_[key]; | ||
| if (registered->second == facebook::velox::StatType::COUNT) { | ||
| // Counters accumulate. | ||
| current += value; | ||
| } else { | ||
| // Any other stat type reports the most recent value. | ||
| current = value; | ||
| } | ||
| } | ||
|
|
||
| // std::string overload; forwards to the const char* overload. | ||
| void StatsReporterImpl::addMetricValue(const std::string& key, size_t value) | ||
| const { | ||
| const char* rawKey = key.c_str(); | ||
| addMetricValue(rawKey, value); | ||
| } | ||
|
|
||
| // folly::StringPiece overload. The view is not guaranteed to be | ||
| // NUL-terminated, so it is copied into a std::string before being passed to | ||
| // the const char* overload. | ||
| void StatsReporterImpl::addMetricValue(folly::StringPiece key, size_t value) | ||
| const { | ||
| const std::string ownedKey = key.str(); | ||
| addMetricValue(ownedKey.c_str(), value); | ||
| } | ||
|
|
||
| // Passes the registered stat types and their current values to `serializer` | ||
| // and returns its output. The mutex is held while serialize() runs. | ||
| const std::string StatsReporterImpl::getMetrics( | ||
| const MetricsSerializer& serializer) { | ||
| std::lock_guard<std::mutex> guard(mutex_); | ||
| std::string serialized = serializer.serialize(registeredStats_, metricsMap_); | ||
| return serialized; | ||
| } | ||
| } // namespace facebook::presto |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,173 @@ | ||
| /* | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| #include <folly/dynamic.h> | ||
| #include <fstream> | ||
| #include <iostream> | ||
| #include "presto_cpp/main/common/Configs.h" | ||
| #include "presto_cpp/main/common/Counters.h" | ||
| #include "velox/common/base/Exceptions.h" | ||
| #include "velox/common/base/StatsReporter.h" | ||
|
|
||
| namespace facebook::presto { | ||
|
|
||
| /// Interface for turning the collected metrics into a string. | ||
| class MetricsSerializer { | ||
| public: | ||
| /// Virtual destructor so that implementations can be destroyed safely | ||
| /// through a MetricsSerializer pointer. | ||
| virtual ~MetricsSerializer() = default; | ||
|
|
||
| /// Serializes the given metrics. | ||
| /// @param metricStatTypes Stat type of each metric, keyed by metric name. | ||
| /// @param metricValues Current value of each metric, keyed by metric name. | ||
| /// @return The serialized representation. | ||
| virtual std::string serialize( | ||
| const std::unordered_map<std::string, facebook::velox::StatType>& | ||
| metricStatTypes, | ||
| const std::unordered_map<std::string, size_t>& metricValues) const = 0; | ||
| }; | ||
|
|
||
| namespace prometheus { | ||
| using Labels = std::unordered_map<std::string, std::string>; | ||
|
|
||
| /// Serializes metrics into the Prometheus text exposition format. The same | ||
| /// set of labels is attached to every metric. | ||
| class PrometheusSerializer : public MetricsSerializer { | ||
| public: | ||
| explicit PrometheusSerializer(const Labels& labels) : labels_(labels) {} | ||
|
|
||
| /// Emits a "# HELP" line, a "# TYPE" line and one sample line for every | ||
| /// entry of metricValues. '.' in a metric name is replaced by '_'. COUNT | ||
| /// stats are reported as "counter", every other stat type as "gauge". | ||
| /// A metric that has a value but no entry in metricStatTypes is skipped. | ||
| std::string serialize( | ||
| const std::unordered_map<std::string, facebook::velox::StatType>& | ||
| metricStatTypes, | ||
| const std::unordered_map<std::string, size_t>& metricValues) | ||
| const override { | ||
| // The labels are identical for every metric, so render them only once. | ||
| std::string labelBlock; | ||
| bool firstLabel = true; | ||
| for (const auto& label : labels_) { | ||
| if (!firstLabel) { | ||
| // Comma separated labels. | ||
| labelBlock += ','; | ||
| } | ||
| firstLabel = false; | ||
| labelBlock += label.first; | ||
| labelBlock += "=\""; | ||
| labelBlock += escapeLabelValue(label.second); | ||
| labelBlock += '"'; | ||
| } | ||
| std::stringstream ss; | ||
| for (const auto& metric : metricValues) { | ||
| const auto typeIt = metricStatTypes.find(metric.first); | ||
| if (typeIt == metricStatTypes.end()) { | ||
| // No registered type for this metric: dereferencing end() would be | ||
| // undefined behaviour, so leave the metric out. | ||
| continue; | ||
| } | ||
| auto metricName = metric.first; | ||
| std::replace(metricName.begin(), metricName.end(), '.', '_'); | ||
| const char* statTypeStr = | ||
| typeIt->second == facebook::velox::StatType::COUNT ? "counter" | ||
| : "gauge"; | ||
| ss << "# HELP " << metricName << '\n'; | ||
| ss << "# TYPE " << metricName << ' ' << statTypeStr << '\n'; | ||
| ss << metricName << '{' << labelBlock << "} " << metric.second << '\n'; | ||
| } | ||
| return ss.str(); | ||
| } | ||
|
|
||
| private: | ||
| // Escapes backslash, double quote and line feed, which the Prometheus text | ||
| // format requires to be escaped inside label values. | ||
| static std::string escapeLabelValue(const std::string& raw) { | ||
| std::string escaped; | ||
| escaped.reserve(raw.size()); | ||
| for (const char c : raw) { | ||
| switch (c) { | ||
| case '\\': | ||
| escaped += "\\\\"; | ||
| break; | ||
| case '"': | ||
| escaped += "\\\""; | ||
| break; | ||
| case '\n': | ||
| escaped += "\\n"; | ||
| break; | ||
| default: | ||
| escaped += c; | ||
| } | ||
| } | ||
| return escaped; | ||
| } | ||
|
|
||
| // A map of labels assigned to each metric which helps in filtering at client | ||
| // end. | ||
| const Labels labels_; | ||
| }; | ||
| } // namespace prometheus. | ||
|
|
||
| /// An implementation of BaseStatsReporter which gathers runtime metrics and | ||
| /// maintains them in memory. Users can call | ||
| /// StatsReporterImpl::getMetrics(MetricsSerializer) to get the metrics as a | ||
| /// custom-formatted string. | ||
| class StatsReporterImpl : public facebook::velox::BaseStatsReporter { | ||
| public: | ||
| /// Builds a reporter tagged with a cluster name and a worker name. | ||
| /// @param cluster Cluster name; when empty, NodeConfig's nodeEnvironment() | ||
| /// is used instead. | ||
| /// @param worker Worker name; only used when the HOSTNAME environment | ||
| /// variable is not set, otherwise HOSTNAME takes precedence. | ||
| StatsReporterImpl( | ||
| const std::string cluster = "", | ||
| const std::string worker = "") { | ||
| if (!cluster.empty()) { | ||
| cluster_ = cluster; | ||
| } else { | ||
| auto config = facebook::presto::NodeConfig::instance(); | ||
| cluster_ = config->nodeEnvironment(); | ||
| } | ||
| const char* envHost = std::getenv("HOSTNAME"); | ||
| if (envHost != nullptr) { | ||
| workerPod_ = envHost; | ||
| } else { | ||
| workerPod_ = worker; | ||
| } | ||
| } | ||
|
|
||
| /// Register a stat of the given stat type. | ||
| /// @param key The key to identify the stat. | ||
| /// @param statType How the stat is aggregated. | ||
| void registerMetricExportType( | ||
| const char* key, | ||
| facebook::velox::StatType statType) const override; | ||
|
|
||
| void registerMetricExportType( | ||
| folly::StringPiece key, | ||
| facebook::velox::StatType statType) const override; | ||
|
|
||
| /// No-op override: the body is empty, so a histogram registration made | ||
| /// through the const char* overload has no effect on this reporter. | ||
| void registerHistogramMetricExportType( | ||
| const char* /*key*/, | ||
| int64_t /* bucketWidth */, | ||
| int64_t /* min */, | ||
| int64_t /* max */, | ||
| const std::vector<int32_t>& /* pcts */) const override {} | ||
|
|
||
| /// No-op override: the body is empty, so a histogram registration made | ||
| /// through the folly::StringPiece overload has no effect on this reporter. | ||
| void registerHistogramMetricExportType( | ||
| folly::StringPiece /* key */, | ||
| int64_t /* bucketWidth */, | ||
| int64_t /* min */, | ||
| int64_t /* max */, | ||
| const std::vector<int32_t>& /* pcts */) const override {} | ||
|
|
||
| void addMetricValue(const std::string& key, size_t value = 1) const override; | ||
|
|
||
| void addMetricValue(const char* key, size_t value = 1) const override; | ||
|
|
||
| void addMetricValue(folly::StringPiece key, size_t value = 1) const override; | ||
|
|
||
| /// Histogram values are ignored: the three overloads below have empty | ||
| /// bodies. Parameter names are commented out, as in the histogram | ||
| /// registration overrides, so they do not trigger unused-parameter warnings. | ||
| void addHistogramMetricValue( | ||
| const std::string& /* key */, | ||
| size_t /* value */) const override {} | ||
|
|
||
| void addHistogramMetricValue(const char* /* key */, size_t /* value */) | ||
| const override {} | ||
|
|
||
| void addHistogramMetricValue( | ||
| folly::StringPiece /* key */, | ||
| size_t /* value */) const override {} | ||
|
|
||
| /// Returns the stat type registered for metricName. For a name that was | ||
| /// never registered, a value-initialized StatType is returned and the | ||
| /// registry is left unchanged. | ||
| const facebook::velox::StatType getRegisteredStatType( | ||
| const std::string& metricName) { | ||
| std::lock_guard<std::mutex> lock(mutex_); | ||
| // operator[] would insert an entry for an unknown name; look it up instead. | ||
| const auto it = registeredStats_.find(metricName); | ||
| if (it == registeredStats_.end()) { | ||
| return facebook::velox::StatType{}; | ||
| } | ||
| return it->second; | ||
| } | ||
|
|
||
| /* | ||
| * Serializes the metrics collected so far in the format suitable for | ||
| * back filling Prometheus server. | ||
| * | ||
| * Given a metric name and a set of labels, time series are frequently | ||
| * identified using this notation: | ||
| * | ||
| * <metric name>{<label name>=<label value>, ...} | ||
| * | ||
| * For example, a time series with the metric name num_tasks_aborted | ||
| * and the labels cluster="<cluster_id>" and worker="worker-id" | ||
| * could be written like this: | ||
| * # HELP num_tasks_aborted | ||
| * # TYPE num_tasks_aborted gauge | ||
| * num_tasks_aborted{cluster="<cluster_id>", worker="worker-id"} value | ||
| * timestamp | ||
| * | ||
| * Above info is from: | ||
| * https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels | ||
| */ | ||
| const std::string getMetrics(const MetricsSerializer& serializer); | ||
|
|
||
| private: | ||
| /// Mapping of registered stats key to StatType. | ||
| mutable std::unordered_map<std::string, facebook::velox::StatType> | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use `mutable folly::ConcurrentHashMap`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We normally use |
||
| registeredStats_; | ||
| /// A mapping from stat key to its current value. COUNT stats accumulate; | ||
| /// all other stat types hold the most recently reported value. | ||
| mutable std::unordered_map<std::string, size_t> metricsMap_; | ||
| // Mutex to control access to registeredStats_ and metricsMap_ members. | ||
| mutable std::mutex mutex_; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do not need mutex |
||
| std::string cluster_; | ||
| std::string workerPod_; | ||
| }; // class StatsReporterImpl | ||
| } // namespace facebook::presto | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| find_package(prometheus-cpp CONFIG REQUIRED) | ||
| add_library(prometheus_reporter PrometheusReporter.cpp PrometheusReporter.h) | ||
| target_link_libraries(prometheus_reporter presto_common prometheus-cpp::core) | ||
| set_property(TARGET prometheus_reporter PROPERTY JOB_POOL_LINK | ||
| presto_link_job_pool) | ||
| add_executable(prometheus_reporter_test PrometheusReporterTest.cpp) | ||
| # Register the test binary with CTest so that it runs as part of `ctest`. | ||
| add_test(prometheus_reporter_test prometheus_reporter_test) | ||
| target_link_libraries(prometheus_reporter_test presto_server_lib | ||
| velox_exec_test_lib prometheus_reporter gtest gtest_main) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this? What if we just define an exposer, which will create another HTTP server on /metrics or /v1/metrics? We can also define the number of threads to be used by the new HTTP server.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is exactly what we are trying to avoid. We didn't find a strong need to start a new server sharing space with our worker containers. Avoiding it reduces the overhead of maintaining and launching another process. Why do you want multiple threads to be spawned by the exposer? Are you expecting high traffic?
Correct me if I'm wrong, but the Prometheus server is configured to call the scrape endpoint periodically, so we can expect one HTTP request every X seconds. If you are expecting huge traffic, then sharing space with the worker instance is not recommended.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think isolation is another point: we want to avoid degrading the Presto process as much as we can.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@amitkdutta: What approach did Meta take for this? Does your metric collection use an endpoint in the Presto worker process itself, or did you start another server?