-
Notifications
You must be signed in to change notification settings - Fork 2.5k
/
Copy pathclient.go
354 lines (306 loc) · 11.1 KB
/
client.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package flinkmetricsreceiver // import "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/flinkmetricsreceiver"
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
"go.opentelemetry.io/collector/component"
"go.uber.org/zap"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/flinkmetricsreceiver/internal/models"
)
// The API endpoints required to collect metrics. Entries containing format
// verbs are templates that are expanded with fmt.Sprintf before being requested.
const (
// jobmanagerMetricEndpoint gets jobmanager metrics.
jobmanagerMetricEndpoint = "/jobmanager/metrics"
// taskmanagersEndpoint gets taskmanager IDs.
taskmanagersEndpoint = "/taskmanagers"
// taskmanagersMetricEndpoint gets taskmanager metrics using a taskmanager ID.
taskmanagersMetricEndpoint = "/taskmanagers/%s/metrics"
// jobsEndpoint gets job IDs.
jobsEndpoint = "/jobs"
// jobsOverviewEndpoint gets job IDs with associated job names.
jobsOverviewEndpoint = "/jobs/overview"
// jobsWithIDEndpoint gets vertex IDs using a job ID.
jobsWithIDEndpoint = "/jobs/%s"
// jobsMetricEndpoint gets job metrics using a job ID.
jobsMetricEndpoint = "/jobs/%s/metrics"
// verticesEndpoint gets subtask indices using a job ID and vertex ID.
verticesEndpoint = "/jobs/%s/vertices/%s"
// subtaskMetricEndpoint gets subtask metrics using a job ID, vertex ID and subtask index.
subtaskMetricEndpoint = "/jobs/%s/vertices/%s/subtasks/%v/metrics"
)
// client describes the metric-collection calls made against the Flink REST
// API, one method per metric group. flinkClient is the implementation defined
// in this file.
type client interface {
// GetJobmanagerMetrics returns the jobmanager metrics.
GetJobmanagerMetrics(ctx context.Context) (*models.JobmanagerMetrics, error)
// GetTaskmanagersMetrics returns one metrics entry per taskmanager.
GetTaskmanagersMetrics(ctx context.Context) ([]*models.TaskmanagerMetrics, error)
// GetJobsMetrics returns one metrics entry per job.
GetJobsMetrics(ctx context.Context) ([]*models.JobMetrics, error)
// GetSubtasksMetrics returns one metrics entry per subtask of every job vertex.
GetSubtasksMetrics(ctx context.Context) ([]*models.SubtaskMetrics, error)
}
// flinkClient implements client by issuing HTTP GET requests against a Flink
// REST endpoint.
type flinkClient struct {
// client performs the HTTP requests.
client *http.Client
// hostEndpoint is the base URL that request paths are appended to.
hostEndpoint string
// hostName is the value returned by getHostname at construction time; it is
// reported as Host on jobmanager and job metrics.
hostName string
// logger receives debug and warning output from the client.
logger *zap.Logger
}
// newClient builds a flinkClient from the receiver configuration.
//
// The HTTP client is created through cfg.ToClient and the hostname is resolved
// once via getHostname and stored on the returned client. An error is returned
// if either step fails; the HTTP client error is wrapped, the hostname error
// is returned as-is.
func newClient(ctx context.Context, cfg *Config, host component.Host, settings component.TelemetrySettings, logger *zap.Logger) (client, error) {
	httpClient, clientErr := cfg.ToClient(ctx, host, settings)
	if clientErr != nil {
		return nil, fmt.Errorf("failed to create HTTP Client: %w", clientErr)
	}

	localName, hostErr := getHostname()
	if hostErr != nil {
		return nil, hostErr
	}

	fc := &flinkClient{
		logger:       logger,
		hostEndpoint: cfg.Endpoint,
		hostName:     localName,
		client:       httpClient,
	}
	return fc, nil
}
// get issues a GET request for hostEndpoint+path and returns the raw response
// body.
//
// Any status other than 200 is treated as an error: the status code and, on a
// best-effort basis, the response payload are logged at debug level, and an
// error carrying the status code is returned. The response body is closed
// before get returns; a failure to close it is logged as a warning only.
func (c *flinkClient) get(ctx context.Context, path string) ([]byte, error) {
	// Construct endpoint and create request
	url := c.hostEndpoint + path
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, http.NoBody)
	if err != nil {
		return nil, fmt.Errorf("failed to create get request for path %s: %w", path, err)
	}

	// Make request
	resp, err := c.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to make http request: %w", err)
	}

	// Defer body close
	defer func() {
		if closeErr := resp.Body.Close(); closeErr != nil {
			c.logger.Warn("failed to close response body", zap.Error(closeErr))
		}
	}()

	// Check for OK status code
	if resp.StatusCode != http.StatusOK {
		// The request itself succeeded, so there is no error value to attach
		// here; the status code is the only useful detail at this point.
		c.logger.Debug("flink API non-200", zap.Int("status_code", resp.StatusCode))

		// Attempt to extract the error payload. A separate variable name keeps
		// the read error from shadowing the outer err.
		payloadData, readErr := io.ReadAll(resp.Body)
		if readErr != nil {
			c.logger.Debug("failed to read payload error message", zap.Error(readErr))
		} else {
			c.logger.Debug("flink API Error", zap.ByteString("api_error", payloadData))
		}

		return nil, fmt.Errorf("non 200 code returned %d", resp.StatusCode)
	}

	return io.ReadAll(resp.Body)
}
// getMetrics fetches the metrics exposed at path in two steps: the first
// request lists the available metric names, the second requests their values
// by passing those names as a comma-separated "get" query parameter.
//
// On success the returned pointer is never nil. An error is returned if either
// request fails or either response body cannot be decoded as JSON.
func (c *flinkClient) getMetrics(ctx context.Context, path string) (*models.MetricsResponse, error) {
	// Get the metric names
	body, err := c.get(ctx, path)
	if err != nil {
		c.logger.Debug("failed to retrieve metric names", zap.Error(err))
		return nil, err
	}

	// Decode into a value rather than through a nil pointer: a JSON `null`
	// body would leave a pointer nil and the dereference below would panic.
	var metrics models.MetricsResponse
	if err = json.Unmarshal(body, &metrics); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response body: %w", err)
	}

	// Construct a get query parameter using comma-separated list of string values to select specific metrics
	query := make([]string, len(metrics))
	for i, metricName := range metrics {
		query[i] = metricName.ID
	}
	metricsPath := path + "?get=" + strings.Join(query, ",")

	// Get the metric values using the query
	body, err = c.get(ctx, metricsPath)
	if err != nil {
		c.logger.Debug("failed to retrieve metric values", zap.Error(err))
		return nil, err
	}

	// Populates metric values, decoding over the names collected above
	if err = json.Unmarshal(body, &metrics); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response body: %w", err)
	}

	return &metrics, nil
}
// GetJobmanagerMetrics collects the jobmanager metrics and pairs them with the
// client's hostname, which is used to tell multiple jobmanager instances apart.
func (c *flinkClient) GetJobmanagerMetrics(ctx context.Context) (*models.JobmanagerMetrics, error) {
	resp, err := c.getMetrics(ctx, jobmanagerMetricEndpoint)
	if err != nil {
		return nil, err
	}

	result := &models.JobmanagerMetrics{
		Host:    c.hostName,
		Metrics: *resp,
	}
	return result, nil
}
// GetTaskmanagersMetrics gets the taskmanager metrics for each taskmanager.
//
// It first requests the list of taskmanager IDs and then collects the metrics
// of every listed taskmanager. An error is returned if the ID request fails,
// the response cannot be decoded, or any per-taskmanager collection fails.
func (c *flinkClient) GetTaskmanagersMetrics(ctx context.Context) ([]*models.TaskmanagerMetrics, error) {
	// Get the taskmanager id list
	body, err := c.get(ctx, taskmanagersEndpoint)
	if err != nil {
		c.logger.Debug("failed to retrieve taskmanager IDs", zap.Error(err))
		return nil, err
	}

	// Decode into a value rather than through a nil pointer: a JSON `null`
	// body would leave a pointer nil and it would be dereferenced downstream.
	var taskmanagerIDs models.TaskmanagerIDsResponse
	if err = json.Unmarshal(body, &taskmanagerIDs); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response body: %w", err)
	}

	// Get taskmanager metrics for each taskmanager id
	return c.getTaskmanagersMetricsByIDs(ctx, &taskmanagerIDs)
}
// getTaskmanagersMetricsByIDs collects the metrics of every taskmanager listed
// in taskmanagerIDs, in listing order. The first failing collection aborts the
// whole call and its error is returned with a nil slice.
func (c *flinkClient) getTaskmanagersMetricsByIDs(ctx context.Context, taskmanagerIDs *models.TaskmanagerIDsResponse) ([]*models.TaskmanagerMetrics, error) {
	results := make([]*models.TaskmanagerMetrics, len(taskmanagerIDs.Taskmanagers))

	for idx, tm := range taskmanagerIDs.Taskmanagers {
		collected, err := c.getMetrics(ctx, fmt.Sprintf(taskmanagersMetricEndpoint, tm.ID))
		if err != nil {
			return nil, err
		}

		// Both attributes are derived from the raw taskmanager ID.
		results[idx] = &models.TaskmanagerMetrics{
			TaskmanagerID: getTaskmanagerID(tm.ID),
			Host:          getTaskmanagerHost(tm.ID),
			Metrics:       *collected,
		}
	}

	return results, nil
}
// GetJobsMetrics gets the job metrics for each job.
//
// It first requests the jobs overview (job IDs with their names) and then
// collects the metrics of every listed job. An error is returned if the
// overview request fails, the response cannot be decoded, or any per-job
// collection fails.
func (c *flinkClient) GetJobsMetrics(ctx context.Context) ([]*models.JobMetrics, error) {
	// Get the job id and name list
	body, err := c.get(ctx, jobsOverviewEndpoint)
	if err != nil {
		c.logger.Debug("failed to retrieve job IDs", zap.Error(err))
		return nil, err
	}

	// Decode into a value rather than through a nil pointer: a JSON `null`
	// body would leave a pointer nil and it would be dereferenced downstream.
	var jobIDs models.JobOverviewResponse
	if err = json.Unmarshal(body, &jobIDs); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response body: %w", err)
	}

	// Get job metrics for each job id
	return c.getJobsMetricsByIDs(ctx, &jobIDs)
}
// getJobsMetricsByIDs collects the metrics of every job listed in jobIDs, in
// listing order, labelling each entry with the client's hostname and the job
// name. The first failing collection aborts the whole call and its error is
// returned with a nil slice.
func (c *flinkClient) getJobsMetricsByIDs(ctx context.Context, jobIDs *models.JobOverviewResponse) ([]*models.JobMetrics, error) {
	results := make([]*models.JobMetrics, len(jobIDs.Jobs))

	for idx, job := range jobIDs.Jobs {
		collected, err := c.getMetrics(ctx, fmt.Sprintf(jobsMetricEndpoint, job.Jid))
		if err != nil {
			return nil, err
		}

		results[idx] = &models.JobMetrics{
			Host:    c.hostName,
			JobName: job.Name,
			Metrics: *collected,
		}
	}

	return results, nil
}
// GetSubtasksMetrics gets subtask metrics for each job id, vertex id and subtask index.
//
// It first requests the list of job IDs and then walks every job's vertices
// and subtasks. An error is returned if the job list request fails, the
// response cannot be decoded, or any nested request fails.
func (c *flinkClient) GetSubtasksMetrics(ctx context.Context) ([]*models.SubtaskMetrics, error) {
	// Get the job id's
	body, err := c.get(ctx, jobsEndpoint)
	if err != nil {
		c.logger.Debug("failed to retrieve job IDs", zap.Error(err))
		return nil, err
	}

	// Decode into a value rather than through a nil pointer: a JSON `null`
	// body would leave a pointer nil and it would be dereferenced downstream.
	var jobsResponse models.JobsResponse
	if err = json.Unmarshal(body, &jobsResponse); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response body: %w", err)
	}

	return c.getSubtasksMetricsByIDs(ctx, &jobsResponse)
}
// getSubtasksMetricsByIDs gets subtask metrics for each job id, vertex id and subtask index.
//
// For every job in jobsResponse it requests the job's vertices, for every
// vertex its subtasks, and for every subtask its metrics. Each result carries
// the job name, vertex name, subtask index and the host and taskmanager ID
// derived from the subtask's taskmanager ID. The first failing request or
// decode aborts the whole call and its error is returned with a nil slice.
// A nil jobsResponse yields no results and no error.
func (c *flinkClient) getSubtasksMetricsByIDs(ctx context.Context, jobsResponse *models.JobsResponse) ([]*models.SubtaskMetrics, error) {
	if jobsResponse == nil {
		return nil, nil
	}

	var subtaskInstances []*models.SubtaskMetrics

	// Get vertices for each job
	for _, job := range jobsResponse.Jobs {
		body, err := c.get(ctx, fmt.Sprintf(jobsWithIDEndpoint, job.ID))
		if err != nil {
			c.logger.Debug("failed to retrieve job with ID", zap.Error(err))
			return nil, err
		}

		// Decoded into a value (not through a nil pointer) so that a JSON
		// `null` body yields an empty response instead of a nil dereference.
		var jobsWithIDResponse models.JobsWithIDResponse
		if err = json.Unmarshal(body, &jobsWithIDResponse); err != nil {
			return nil, fmt.Errorf("failed to unmarshal response body: %w", err)
		}

		// Gets subtask info for each vertex id
		for _, vertex := range jobsWithIDResponse.Vertices {
			body, err = c.get(ctx, fmt.Sprintf(verticesEndpoint, job.ID, vertex.ID))
			if err != nil {
				c.logger.Debug("failed to retrieve vertex with ID", zap.Error(err))
				return nil, err
			}

			// Same reasoning as above: decode into a value, not a nil pointer.
			var vertexResponse models.VerticesResponse
			if err = json.Unmarshal(body, &vertexResponse); err != nil {
				return nil, fmt.Errorf("failed to unmarshal response body: %w", err)
			}

			// Gets subtask metrics for each vertex id
			for _, subtask := range vertexResponse.Subtasks {
				query := fmt.Sprintf(subtaskMetricEndpoint, job.ID, vertex.ID, subtask.Subtask)
				subtaskMetrics, err := c.getMetrics(ctx, query)
				if err != nil {
					c.logger.Debug("failed to retrieve subtasks metrics", zap.Error(err))
					return nil, err
				}

				// Stores subtask info with additional attribute values to uniquely identify metrics
				subtaskInstances = append(subtaskInstances, &models.SubtaskMetrics{
					Host:          getTaskmanagerHost(subtask.TaskmanagerID),
					TaskmanagerID: getTaskmanagerID(subtask.TaskmanagerID),
					JobName:       jobsWithIDResponse.Name,
					TaskName:      vertex.Name,
					SubtaskIndex:  strconv.Itoa(subtask.Subtask),
					Metrics:       *subtaskMetrics,
				})
			}
		}
	}

	return subtaskInstances, nil
}
// osHostname is the hostname lookup used by getHostname. It is a package
// variable so tests can replace it.
var osHostname = os.Hostname

// getHostname returns the name reported by osHostname. When the lookup fails
// the error is passed through and the returned name is always empty.
func getHostname() (string, error) {
	name, err := osHostname()
	if err == nil {
		return name, nil
	}
	return "", err
}
// taskmanagerHost is the splitter used by getTaskmanagerHost. It is a package
// variable so tests can replace it.
var taskmanagerHost = strings.Split

// getTaskmanagerHost returns the part of a taskmanager ID that precedes the
// first ":"; an ID without ":" is returned unchanged.
func getTaskmanagerHost(id string) string {
	return taskmanagerHost(id, ":")[0]
}
// reflect returns its argument unchanged. It is the default implementation
// behind taskmanagerID.
func reflect(s string) string { return s }

// taskmanagerID is the mapping applied by getTaskmanagerID. It is a package
// variable so tests can replace it.
var taskmanagerID = reflect

// getTaskmanagerID maps a raw taskmanager ID to the ID reported on metrics by
// applying taskmanagerID, which by default leaves the ID untouched.
func getTaskmanagerID(id string) string {
	mapped := taskmanagerID(id)
	return mapped
}