From 94394629e0146617302d7616a813adfb79be445d Mon Sep 17 00:00:00 2001
From: Eric Richardson
Date: Tue, 5 Jan 2016 10:23:51 -0500
Subject: [PATCH] Update stats structs to support ES 2.0; Add Thread Pool stats

* Addresses #8
* Update README to note changes in ES 2.0
* Add stats for thread pool
---
 Makefile                  |  2 +-
 elasticsearch_exporter.go | 81 +++++++++++++++++++++++++++++++--------
 struct.go                 | 48 ++++++++++++++---------
 3 files changed, 95 insertions(+), 36 deletions(-)

diff --git a/Makefile b/Makefile
index 55ef0296..b8f4c36d 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-VERSION := 0.2.1
+VERSION := 0.3.0
 TARGET  := elasticsearch_exporter
 
 include Makefile.COMMON
diff --git a/elasticsearch_exporter.go b/elasticsearch_exporter.go
index 47373c64..f908dbfb 100644
--- a/elasticsearch_exporter.go
+++ b/elasticsearch_exporter.go
@@ -26,22 +26,27 @@ type VecInfo struct {
 
 var (
 	gaugeMetrics = map[string]string{
-		"indices_fielddata_memory_size_bytes":    "Field data cache memory usage in bytes",
-		"indices_filter_cache_memory_size_bytes": "Filter cache memory usage in bytes",
-		"indices_docs":                           "Count of documents on this node",
-		"indices_docs_deleted":                   "Count of deleted documents on this node",
-		"indices_store_size_bytes":               "Current size of stored index data in bytes",
-		"indices_segments_memory_bytes":          "Current memory size of segments in bytes",
-		"indices_segments_count":                 "Count of index segments on this node",
-		"process_cpu_percent":                    "Percent CPU used by process",
-		"process_mem_resident_size_bytes":        "Resident memory in use by process in bytes",
-		"process_mem_share_size_bytes":           "Shared memory in use by process in bytes",
-		"process_mem_virtual_size_bytes":         "Total virtual memory used in bytes",
-		"process_open_files_count":               "Open file descriptors",
+		"indices_fielddata_memory_size_bytes":      "Field data cache memory usage in bytes",
+		"indices_filter_cache_memory_size_bytes":   "Filter cache memory usage in bytes",
+		"indices_query_cache_memory_size_bytes":    "Query cache memory usage in bytes",
+		"indices_request_cache_memory_size_bytes":  "Request cache memory usage in bytes",
+		"indices_docs":                             "Count of documents on this node",
+		"indices_docs_deleted":                     "Count of deleted documents on this node",
+		"indices_store_size_bytes":                 "Current size of stored index data in bytes",
+		"indices_segments_memory_bytes":            "Current memory size of segments in bytes",
+		"indices_segments_count":                   "Count of index segments on this node",
+		"process_cpu_percent":                      "Percent CPU used by process",
+		"process_mem_resident_size_bytes":          "Resident memory in use by process in bytes",
+		"process_mem_share_size_bytes":             "Shared memory in use by process in bytes",
+		"process_mem_virtual_size_bytes":           "Total virtual memory used in bytes",
+		"process_open_files_count":                 "Open file descriptors",
+		"process_max_files_count":                  "Max file descriptors for process",
 	}
 	counterMetrics = map[string]string{
 		"indices_fielddata_evictions":    "Evictions from field data",
 		"indices_filter_cache_evictions": "Evictions from filter cache",
+		"indices_query_cache_evictions":   "Evictions from query cache",
+		"indices_request_cache_evictions": "Evictions from request cache",
 		"indices_flush_total":            "Total flushes",
 		"indices_flush_time_ms_total":    "Cumulative flush time in milliseconds",
 		"transport_rx_packets_total":     "Count of packets received",
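Note: these maps only declare metric names and help strings; NewExporter (not shown in this hunk) turns each entry into a registered Prometheus vector carrying the cluster/host labels used throughout Collect. A minimal sketch of that registration, assuming an "es" namespace and "cluster"/"host" label names (both assumptions; the authoritative loop lives further down in elasticsearch_exporter.go):

    // Sketch only, not part of this patch; the namespace and base label
    // names are assumptions about what NewExporter actually uses.
    gauges := make(map[string]*prometheus.GaugeVec, len(gaugeMetrics))
    for name, help := range gaugeMetrics {
        gauges[name] = prometheus.NewGaugeVec(prometheus.GaugeOpts{
            Namespace: "es",
            Name:      name,
            Help:      help,
        }, []string{"cluster", "host"})
    }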
"thread_pool_completed_count": &VecInfo{ + help: "Thread Pool operations completed", + labels: []string{"type"}, + }, + "thread_pool_rejected_count": &VecInfo{ + help: "Thread Pool operations rejected", + labels: []string{"type"}, + }, } gaugeVecMetrics = map[string]*VecInfo{ @@ -94,6 +107,22 @@ var ( help: "JVM memory max", labels: []string{"area"}, }, + "thread_pool_active_count": &VecInfo{ + help: "Thread Pool threads active", + labels: []string{"type"}, + }, + "thread_pool_largest_count": &VecInfo{ + help: "Thread Pool largest threads count", + labels: []string{"type"}, + }, + "thread_pool_queue_count": &VecInfo{ + help: "Thread Pool operations queued", + labels: []string{"type"}, + }, + "thread_pool_threads_count": &VecInfo{ + help: "Thread Pool current threads count", + labels: []string{"type"}, + }, } ) @@ -110,7 +139,7 @@ type Exporter struct { counters map[string]*prometheus.CounterVec counterVecs map[string]*prometheus.CounterVec - allNodes bool + allNodes bool client *http.Client } @@ -169,7 +198,7 @@ func NewExporter(uri string, timeout time.Duration, allNodes bool) *Exporter { gauges: gauges, gaugeVecs: gaugeVecs, - allNodes: allNodes, + allNodes: allNodes, client: &http.Client{ Transport: &http.Transport{ @@ -234,7 +263,7 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) { vec.Reset() } - defer func() { ch <- e.up }() + defer func() { ch <- e.up }() resp, err := e.client.Get(e.URI) if err != nil { @@ -278,6 +307,17 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) { e.gaugeVecs["breakers_limit_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host, breaker).Set(float64(bstats.LimitSize)) } + // Thread Pool stats + for pool, pstats := range stats.ThreadPool { + e.counterVecs["thread_pool_completed_count"].WithLabelValues(allStats.ClusterName, stats.Host, pool).Set(float64(pstats.Completed)) + e.counterVecs["thread_pool_rejected_count"].WithLabelValues(allStats.ClusterName, stats.Host, pool).Set(float64(pstats.Rejected)) + + e.gaugeVecs["thread_pool_active_count"].WithLabelValues(allStats.ClusterName, stats.Host, pool).Set(float64(pstats.Active)) + e.gaugeVecs["thread_pool_threads_count"].WithLabelValues(allStats.ClusterName, stats.Host, pool).Set(float64(pstats.Active)) + e.gaugeVecs["thread_pool_largest_count"].WithLabelValues(allStats.ClusterName, stats.Host, pool).Set(float64(pstats.Active)) + e.gaugeVecs["thread_pool_queue_count"].WithLabelValues(allStats.ClusterName, stats.Host, pool).Set(float64(pstats.Active)) + } + // JVM Memory Stats e.gaugeVecs["jvm_memory_committed_bytes"].WithLabelValues(allStats.ClusterName, stats.Host, "heap").Set(float64(stats.JVM.Mem.HeapCommitted)) e.gaugeVecs["jvm_memory_used_bytes"].WithLabelValues(allStats.ClusterName, stats.Host, "heap").Set(float64(stats.JVM.Mem.HeapUsed)) @@ -287,9 +327,16 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) { // Indices Stats e.gauges["indices_fielddata_memory_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FieldData.MemorySize)) + e.counters["indices_fielddata_evictions"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FieldData.Evictions)) + e.gauges["indices_filter_cache_memory_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FilterCache.MemorySize)) e.counters["indices_filter_cache_evictions"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FilterCache.Evictions)) - 
@@ -287,9 +327,16 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
 
 		// Indices Stats
 		e.gauges["indices_fielddata_memory_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FieldData.MemorySize))
+		e.counters["indices_fielddata_evictions"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FieldData.Evictions))
+
 		e.gauges["indices_filter_cache_memory_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FilterCache.MemorySize))
 		e.counters["indices_filter_cache_evictions"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FilterCache.Evictions))
-		e.counters["indices_fielddata_evictions"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.FieldData.Evictions))
+
+		e.gauges["indices_query_cache_memory_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.QueryCache.MemorySize))
+		e.counters["indices_query_cache_evictions"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.QueryCache.Evictions))
+
+		e.gauges["indices_request_cache_memory_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.RequestCache.MemorySize))
+		e.counters["indices_request_cache_evictions"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.RequestCache.Evictions))
 
 		e.gauges["indices_docs"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.Docs.Count))
 		e.gauges["indices_docs_deleted"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Indices.Docs.Deleted))
@@ -326,6 +373,8 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
 		e.gauges["process_mem_virtual_size_bytes"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Process.Memory.TotalVirtual))
 		e.gauges["process_open_files_count"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Process.OpenFD))
+		e.gauges["process_max_files_count"].WithLabelValues(allStats.ClusterName, stats.Host).Set(float64(stats.Process.MaxFD))
 
+		e.counterVecs["process_cpu_time_seconds_sum"].WithLabelValues(allStats.ClusterName, stats.Host, "total").Set(float64(stats.Process.CPU.Total / 1000))
 		e.counterVecs["process_cpu_time_seconds_sum"].WithLabelValues(allStats.ClusterName, stats.Host, "sys").Set(float64(stats.Process.CPU.Sys / 1000))
 		e.counterVecs["process_cpu_time_seconds_sum"].WithLabelValues(allStats.ClusterName, stats.Host, "user").Set(float64(stats.Process.CPU.User / 1000))
 	}
@@ -359,7 +408,6 @@ func main() {
 	)
 	flag.Parse()
 
-
 	if *esAllNodes {
 		*esURI = *esURI + "/_nodes/stats"
 	} else {
diff --git a/struct.go b/struct.go
index a37b483d..79512f8e 100644
--- a/struct.go
+++ b/struct.go
@@ -1,5 +1,7 @@
 package main
 
+import "encoding/json"
+
 // Elasticsearch Node Stats Structs
 
 type NodeStatsResponse struct {
@@ -95,17 +97,19 @@ type NodeStatsTCPResponse struct {
 }
 
 type NodeStatsIndicesResponse struct {
-	Docs        NodeStatsIndicesDocsResponse
-	Store       NodeStatsIndicesStoreResponse
-	Indexing    NodeStatsIndicesIndexingResponse
-	Merges      NodeStatsIndicesMergesResponse
-	Get         NodeStatsIndicesGetResponse
-	Search      NodeStatsIndicesSearchResponse
-	FieldData   NodeStatsIndicesFieldDataResponse
-	FilterCache NodeStatsIndicesFieldDataResponse `json:"filter_cache"`
-	Flush       NodeStatsIndicesFlushResponse
-	Segments    NodeStatsIndicesSegmentsResponse
-	Refresh     NodeStatsIndicesRefreshResponse
+	Docs         NodeStatsIndicesDocsResponse
+	Store        NodeStatsIndicesStoreResponse
+	Indexing     NodeStatsIndicesIndexingResponse
+	Merges       NodeStatsIndicesMergesResponse
+	Get          NodeStatsIndicesGetResponse
+	Search       NodeStatsIndicesSearchResponse
+	FieldData    NodeStatsIndicesCacheResponse `json:"fielddata"`
+	FilterCache  NodeStatsIndicesCacheResponse `json:"filter_cache"`
+	QueryCache   NodeStatsIndicesCacheResponse `json:"query_cache"`
+	RequestCache NodeStatsIndicesCacheResponse `json:"request_cache"`
+	Flush        NodeStatsIndicesFlushResponse
+	Segments     NodeStatsIndicesSegmentsResponse
+	Refresh      NodeStatsIndicesRefreshResponse
 }
 
 type NodeStatsIndicesDocsResponse struct {
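Note: a self-contained check that the NodeStatsIndicesCacheResponse introduced in the next hunk decodes an ES 2.0 cache fragment as intended; the sample values are invented:

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // Mirrors the struct added by this patch.
    type NodeStatsIndicesCacheResponse struct {
        Evictions  int64 `json:"evictions"`
        MemorySize int64 `json:"memory_size_in_bytes"`
        CacheCount int64 `json:"cache_count"`
        CacheSize  int64 `json:"cache_size"`
        HitCount   int64 `json:"hit_count"`
        MissCount  int64 `json:"miss_count"`
        TotalCount int64 `json:"total_count"`
    }

    func main() {
        // Invented sample in the shape ES 2.0 returns for query_cache.
        raw := []byte(`{"memory_size_in_bytes":4096,"total_count":120,"hit_count":80,"miss_count":40,"cache_size":12,"cache_count":14,"evictions":2}`)
        var c NodeStatsIndicesCacheResponse
        if err := json.Unmarshal(raw, &c); err != nil {
            panic(err)
        }
        fmt.Println(c.MemorySize, c.HitCount, c.Evictions) // 4096 80 2
    }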
@@ -172,18 +176,25 @@ type NodeStatsIndicesFlushResponse struct {
 	Time  int64 `json:"total_time_in_millis"`
 }
 
-type NodeStatsIndicesFieldDataResponse struct {
+type NodeStatsIndicesCacheResponse struct {
 	Evictions  int64 `json:"evictions"`
 	MemorySize int64 `json:"memory_size_in_bytes"`
+	CacheCount int64 `json:"cache_count"`
+	CacheSize  int64 `json:"cache_size"`
+	HitCount   int64 `json:"hit_count"`
+	MissCount  int64 `json:"miss_count"`
+	TotalCount int64 `json:"total_count"`
 }
 
 type NodeStatsOSResponse struct {
-	Timestamp int64                   `json:"timestamp"`
-	Uptime    int64                   `json:"uptime_in_millis"`
-	LoadAvg   []float64               `json:"load_average"`
-	CPU       NodeStatsOSCPUResponse  `json:"cpu"`
-	Mem       NodeStatsOSMemResponse  `json:"mem"`
-	Swap      NodeStatsOSSwapResponse `json:"swap"`
+	Timestamp int64 `json:"timestamp"`
+	Uptime    int64 `json:"uptime_in_millis"`
+	// LoadAvg was an array of 1/5/15-minute values pre-2.0 and changed shape in 2.0.
+	// Keep the raw JSON here in case we want to implement parsing logic later.
+	LoadAvg json.RawMessage `json:"load_average"`
+	CPU     NodeStatsOSCPUResponse  `json:"cpu"`
+	Mem     NodeStatsOSMemResponse  `json:"mem"`
+	Swap    NodeStatsOSSwapResponse `json:"swap"`
 }
 
 type NodeStatsOSMemResponse struct {
@@ -208,6 +219,7 @@ type NodeStatsOSCPUResponse struct {
 type NodeStatsProcessResponse struct {
 	Timestamp int64                       `json:"timestamp"`
 	OpenFD    int64                       `json:"open_file_descriptors"`
+	MaxFD     int64                       `json:"max_file_descriptors"`
 	CPU       NodeStatsProcessCPUResponse `json:"cpu"`
 	Memory    NodeStatsProcessMemResponse `json:"mem"`
 }
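Note: with LoadAvg kept as raw JSON, a follow-up could decode both shapes along these lines. parseLoadAvg is a hypothetical helper, not part of this patch:

    // Hypothetical helper: extract the 1-minute load average from either
    // the pre-2.0 array form ([1m, 5m, 15m]) or the single-value 2.0 form.
    func parseLoadAvg(raw json.RawMessage) (float64, bool) {
        var arr []float64
        if err := json.Unmarshal(raw, &arr); err == nil && len(arr) > 0 {
            return arr[0], true
        }
        var v float64
        if err := json.Unmarshal(raw, &v); err == nil {
            return v, true
        }
        return 0, false
    }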