Skip to content

Commit 6c1f1c4

Browse files
authored
feat: adding sourceId and destinationId in pipeline info metrics (#4332)
* feat: add sourceid and destination id to pipeline_processed_events metric
1 parent cf531a4 commit 6c1f1c4

File tree

9 files changed

+266
-64
lines changed

9 files changed

+266
-64
lines changed

jobsdb/jobsdb.go

+5
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,11 @@ type JobStatusT struct {
365365
WorkspaceId string `json:"WorkspaceId"`
366366
}
367367

368+
// ConnectionDetails pairs a source ID with a destination ID.
// At the call sites visible in this change it is stored in a
// map[int64]ConnectionDetails keyed by job ID and handed to
// routerutils.UpdateProcessedEventsMetrics alongside the status list.
type ConnectionDetails struct {
369+
SourceID string // filled from the job's parameters (parameters.SourceID or the "source_id" JSON key) at the visible call sites
370+
DestinationID string // filled from the job's parameters or from the destination being processed at the visible call sites
371+
}
372+
368373
func (r *JobStatusT) sanitizeJson() {
369374
r.ErrorResponse = sanitizeJson(r.ErrorResponse)
370375
r.Parameters = sanitizeJson(r.Parameters)

router/batchrouter/handle.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ import (
4545
warehouseutils "github.com/rudderlabs/rudder-server/warehouse/utils"
4646
)
4747

48+
const module = "batch_router"
49+
4850
type Handle struct {
4951
destType string
5052
// dependencies
@@ -576,6 +578,7 @@ func (brt *Handle) updateJobStatus(batchJobs *BatchedJobs, isWarehouse bool, err
576578
transformedAtMap := make(map[string]string)
577579
statusDetailsMap := make(map[string]*types.StatusDetail)
578580
jobStateCounts := make(map[string]int)
581+
jobIDConnectionDetailsMap := make(map[int64]jobsdb.ConnectionDetails)
579582
for _, job := range batchJobs.Jobs {
580583
jobState := batchJobState
581584

@@ -622,6 +625,10 @@ func (brt *Handle) updateJobStatus(batchJobs *BatchedJobs, isWarehouse bool, err
622625
abortedEvents = append(abortedEvents, job)
623626
}
624627
attemptNum := job.LastJobStatus.AttemptNum + 1
628+
jobIDConnectionDetailsMap[job.JobID] = jobsdb.ConnectionDetails{
629+
SourceID: parameters.SourceID,
630+
DestinationID: parameters.DestinationID,
631+
}
625632
status := jobsdb.JobStatusT{
626633
JobID: job.JobID,
627634
AttemptNum: attemptNum,
@@ -763,7 +770,7 @@ func (brt *Handle) updateJobStatus(batchJobs *BatchedJobs, isWarehouse bool, err
763770
if err != nil {
764771
panic(err)
765772
}
766-
brt.updateProcessedEventsMetrics(statusList)
773+
routerutils.UpdateProcessedEventsMetrics(stats.Default, module, brt.destType, statusList, jobIDConnectionDetailsMap)
767774
sendDestStatusStats(batchJobs.Connection, jobStateCounts, brt.destType, isWarehouse)
768775
}
769776

router/batchrouter/handle_async.go

+43-17
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/google/uuid"
1616
"github.com/tidwall/gjson"
1717

18+
"github.com/rudderlabs/rudder-go-kit/stats"
1819
"github.com/rudderlabs/rudder-server/jobsdb"
1920
"github.com/rudderlabs/rudder-server/router/batchrouter/asyncdestinationmanager"
2021
"github.com/rudderlabs/rudder-server/router/batchrouter/asyncdestinationmanager/common"
@@ -88,13 +89,13 @@ func getFirstAttemptAtFromErrorResponse(msg stdjson.RawMessage) time.Time {
8889
return res
8990
}
9091

91-
func (brt *Handle) prepareJobStatusList(importingList []*jobsdb.JobT, defaultStatus jobsdb.JobStatusT) ([]*jobsdb.JobStatusT, []*jobsdb.JobT) {
92+
func (brt *Handle) prepareJobStatusList(importingList []*jobsdb.JobT, defaultStatus jobsdb.JobStatusT, sourceID, destinationID string) ([]*jobsdb.JobStatusT, []*jobsdb.JobT, map[int64]jobsdb.ConnectionDetails) {
9293
var abortedJobsList []*jobsdb.JobT
9394
var statusList []*jobsdb.JobStatusT
9495
if defaultStatus.ErrorResponse == nil {
9596
defaultStatus.ErrorResponse = routerutils.EmptyPayload
9697
}
97-
98+
jobIdConnectionDetailsMap := make(map[int64]jobsdb.ConnectionDetails)
9899
for _, job := range importingList {
99100
resp := enhanceResponseWithFirstAttemptedAt(job.LastJobStatus.ErrorResponse, defaultStatus.ErrorResponse)
100101
status := jobsdb.JobStatusT{
@@ -109,6 +110,10 @@ func (brt *Handle) prepareJobStatusList(importingList []*jobsdb.JobT, defaultSta
109110
JobParameters: job.Parameters,
110111
WorkspaceId: job.WorkspaceId,
111112
}
113+
jobIdConnectionDetailsMap[job.JobID] = jobsdb.ConnectionDetails{
114+
SourceID: sourceID,
115+
DestinationID: destinationID,
116+
}
112117

113118
if defaultStatus.JobState == jobsdb.Failed.State {
114119
if brt.retryLimitReached(&status) {
@@ -118,7 +123,7 @@ func (brt *Handle) prepareJobStatusList(importingList []*jobsdb.JobT, defaultSta
118123
}
119124
statusList = append(statusList, &status)
120125
}
121-
return statusList, abortedJobsList
126+
return statusList, abortedJobsList, jobIdConnectionDetailsMap
122127
}
123128

124129
func (brt *Handle) getParamertsFromJobs(jobs []*jobsdb.JobT) map[int64]stdjson.RawMessage {
@@ -129,25 +134,28 @@ func (brt *Handle) getParamertsFromJobs(jobs []*jobsdb.JobT) map[int64]stdjson.R
129134
return parametersMap
130135
}
131136

132-
func (brt *Handle) updatePollStatusToDB(ctx context.Context, destinationID string,
133-
importingJob *jobsdb.JobT, pollResp common.PollStatusResponse,
137+
func (brt *Handle) updatePollStatusToDB(
138+
ctx context.Context,
139+
destinationID string,
140+
sourceID string,
141+
importingJob *jobsdb.JobT,
142+
pollResp common.PollStatusResponse,
134143
) ([]*jobsdb.JobStatusT, error) {
135144
var statusList []*jobsdb.JobStatusT
145+
jobIDConnectionDetailsMap := make(map[int64]jobsdb.ConnectionDetails)
136146
list, err := brt.getImportingJobs(ctx, destinationID, brt.maxEventsInABatch)
137147
if err != nil {
138148
return statusList, err
139149
}
140150
importingList := list.Jobs
141151
if pollResp.StatusCode == http.StatusOK && pollResp.Complete {
142152
if !pollResp.HasFailed && !pollResp.HasWarning {
143-
statusList, _ = brt.prepareJobStatusList(importingList, jobsdb.JobStatusT{JobState: jobsdb.Succeeded.State})
153+
statusList, _, jobIDConnectionDetailsMap = brt.prepareJobStatusList(importingList, jobsdb.JobStatusT{JobState: jobsdb.Succeeded.State}, sourceID, destinationID)
144154
if err := brt.updateJobStatuses(ctx, destinationID, importingList, importingList, statusList); err != nil {
145155
brt.logger.Errorf("[Batch Router] Failed to update job status for Dest Type %v with error %v", brt.destType, err)
146156
return statusList, err
147157
}
148158
brt.asyncSuccessfulJobCount.Count(len(statusList))
149-
brt.updateProcessedEventsMetrics(statusList)
150-
return statusList, nil
151159
} else {
152160
getUploadStatsInput := common.GetUploadStatsInput{
153161
FailedJobURLs: pollResp.FailedJobURLs,
@@ -171,6 +179,10 @@ func (brt *Handle) updatePollStatusToDB(ctx context.Context, destinationID strin
171179
successfulJobIDs := append(uploadStatsResp.Metadata.SucceededKeys, uploadStatsResp.Metadata.WarningKeys...)
172180
for _, job := range importingList {
173181
jobID := job.JobID
182+
jobIDConnectionDetailsMap[jobID] = jobsdb.ConnectionDetails{
183+
SourceID: sourceID,
184+
DestinationID: destinationID,
185+
}
174186
var status *jobsdb.JobStatusT
175187
if slices.Contains(successfulJobIDs, jobID) {
176188
warningRespString := uploadStatsResp.Metadata.WarningReasons[jobID]
@@ -226,26 +238,24 @@ func (brt *Handle) updatePollStatusToDB(ctx context.Context, destinationID strin
226238
brt.logger.Errorf("[Batch Router] Failed to update job status for Dest Type %v with error %v", brt.destType, err)
227239
return statusList, err
228240
}
229-
brt.updateProcessedEventsMetrics(statusList)
230241
}
231242
} else if pollResp.StatusCode == http.StatusBadRequest {
232-
statusList, _ := brt.prepareJobStatusList(importingList, jobsdb.JobStatusT{JobState: jobsdb.Aborted.State, ErrorResponse: misc.UpdateJSONWithNewKeyVal(routerutils.EmptyPayload, "error", "poll failed with status code 400")})
243+
statusList, _, jobIDConnectionDetailsMap = brt.prepareJobStatusList(importingList, jobsdb.JobStatusT{JobState: jobsdb.Aborted.State, ErrorResponse: misc.UpdateJSONWithNewKeyVal(routerutils.EmptyPayload, "error", "poll failed with status code 400")}, sourceID, destinationID)
233244
if err := brt.updateJobStatuses(ctx, destinationID, importingList, importingList, statusList); err != nil {
234245
brt.logger.Errorf("[Batch Router] Failed to update job status for Dest Type %v with error %v", brt.destType, err)
235246
return statusList, err
236247
}
237248
brt.asyncAbortedJobCount.Count(len(statusList))
238-
brt.updateProcessedEventsMetrics(statusList)
239249
} else {
240-
statusList, abortedJobsList := brt.prepareJobStatusList(importingList, jobsdb.JobStatusT{JobState: jobsdb.Failed.State, ErrorResponse: misc.UpdateJSONWithNewKeyVal(routerutils.EmptyPayload, "error", pollResp.Error)})
250+
var abortedJobsList []*jobsdb.JobT
251+
statusList, abortedJobsList, jobIDConnectionDetailsMap = brt.prepareJobStatusList(importingList, jobsdb.JobStatusT{JobState: jobsdb.Failed.State, ErrorResponse: misc.UpdateJSONWithNewKeyVal(routerutils.EmptyPayload, "error", pollResp.Error)}, sourceID, destinationID)
241252
if err := brt.updateJobStatuses(ctx, destinationID, importingList, abortedJobsList, statusList); err != nil {
242253
brt.logger.Errorf("[Batch Router] Failed to update job status for Dest Type %v with error %v", brt.destType, err)
243254
return statusList, err
244255
}
245256
brt.asyncFailedJobCount.Count(len(statusList))
246-
brt.updateProcessedEventsMetrics(statusList)
247257
}
248-
258+
routerutils.UpdateProcessedEventsMetrics(stats.Default, module, brt.destType, statusList, jobIDConnectionDetailsMap)
249259
return statusList, nil
250260
}
251261

@@ -280,7 +290,7 @@ func (brt *Handle) pollAsyncStatus(ctx context.Context) {
280290
if pollResp.InProgress {
281291
continue
282292
}
283-
statusList, err := brt.updatePollStatusToDB(ctx, destinationID, importingJob, pollResp)
293+
statusList, err := brt.updatePollStatusToDB(ctx, destinationID, sourceID, importingJob, pollResp)
284294
if err == nil {
285295
brt.asyncDestinationStruct[destinationID].UploadInProgress = false
286296
brt.recordAsyncDestinationDeliveryStatus(sourceID, destinationID, statusList)
@@ -576,8 +586,13 @@ func (brt *Handle) setMultipleJobStatus(asyncOutput common.AsyncUploadOutput, at
576586
workspaceID := brt.GetWorkspaceIDForDestID(asyncOutput.DestinationID)
577587
var completedJobsList []*jobsdb.JobT
578588
var statusList []*jobsdb.JobStatusT
589+
jobIDConnectionDetailsMap := make(map[int64]jobsdb.ConnectionDetails)
579590
if len(asyncOutput.ImportingJobIDs) > 0 {
580591
for _, jobId := range asyncOutput.ImportingJobIDs {
592+
jobIDConnectionDetailsMap[jobId] = jobsdb.ConnectionDetails{
593+
DestinationID: asyncOutput.DestinationID,
594+
SourceID: gjson.GetBytes(originalJobParameters[jobId], "source_id").String(),
595+
}
581596
status := jobsdb.JobStatusT{
582597
JobID: jobId,
583598
JobState: jobsdb.Importing.State,
@@ -595,6 +610,10 @@ func (brt *Handle) setMultipleJobStatus(asyncOutput common.AsyncUploadOutput, at
595610
}
596611
if len(asyncOutput.SucceededJobIDs) > 0 {
597612
for _, jobId := range asyncOutput.SucceededJobIDs {
613+
jobIDConnectionDetailsMap[jobId] = jobsdb.ConnectionDetails{
614+
DestinationID: asyncOutput.DestinationID,
615+
SourceID: gjson.GetBytes(originalJobParameters[jobId], "source_id").String(),
616+
}
598617
status := jobsdb.JobStatusT{
599618
JobID: jobId,
600619
JobState: jobsdb.Succeeded.State,
@@ -613,6 +632,10 @@ func (brt *Handle) setMultipleJobStatus(asyncOutput common.AsyncUploadOutput, at
613632
}
614633
if len(asyncOutput.FailedJobIDs) > 0 {
615634
for _, jobId := range asyncOutput.FailedJobIDs {
635+
jobIDConnectionDetailsMap[jobId] = jobsdb.ConnectionDetails{
636+
DestinationID: asyncOutput.DestinationID,
637+
SourceID: gjson.GetBytes(originalJobParameters[jobId], "source_id").String(),
638+
}
616639
resp := misc.UpdateJSONWithNewKeyVal(routerutils.EmptyPayload, "error", asyncOutput.FailedReason)
617640
status := jobsdb.JobStatusT{
618641
JobID: jobId,
@@ -641,6 +664,10 @@ func (brt *Handle) setMultipleJobStatus(asyncOutput common.AsyncUploadOutput, at
641664
}
642665
if len(asyncOutput.AbortJobIDs) > 0 {
643666
for _, jobId := range asyncOutput.AbortJobIDs {
667+
jobIDConnectionDetailsMap[jobId] = jobsdb.ConnectionDetails{
668+
DestinationID: asyncOutput.DestinationID,
669+
SourceID: gjson.GetBytes(originalJobParameters[jobId], "source_id").String(),
670+
}
644671
status := jobsdb.JobStatusT{
645672
JobID: jobId,
646673
JobState: jobsdb.Aborted.State,
@@ -696,14 +723,13 @@ func (brt *Handle) setMultipleJobStatus(asyncOutput common.AsyncUploadOutput, at
696723
if err != nil {
697724
panic(err)
698725
}
699-
brt.updateProcessedEventsMetrics(statusList)
726+
routerutils.UpdateProcessedEventsMetrics(stats.Default, module, brt.destType, statusList, jobIDConnectionDetailsMap)
700727
rmetrics.DecreasePendingEvents(
701728
"batch_rt",
702729
workspaceID,
703730
brt.destType,
704731
float64(len(completedJobsList)),
705732
)
706-
707733
if attempted {
708734
var sourceID string
709735
if len(statusList) > 0 {

router/batchrouter/handle_observability.go

-22
Original file line numberDiff line numberDiff line change
@@ -229,28 +229,6 @@ func (brt *Handle) updateRudderSourcesStats(
229229
return nil
230230
}
231231

232-
// updateProcessedEventsMetrics tallies the given job statuses by JobState and
// ErrorCode and reports every (state, code) count on the
// pipeline_processed_events counter, tagged with module "batch_router", the
// handle's destType, the state and the code.
func (brt *Handle) updateProcessedEventsMetrics(statusList []*jobsdb.JobStatusT) {
233-
// state -> error code -> number of statuses seen
eventsPerStateAndCode := map[string]map[string]int{}
234-
for i := range statusList {
235-
state := statusList[i].JobState
236-
code := statusList[i].ErrorCode
237-
// lazily create the inner map the first time a state is seen
if _, ok := eventsPerStateAndCode[state]; !ok {
238-
eventsPerStateAndCode[state] = map[string]int{}
239-
}
240-
eventsPerStateAndCode[state][code]++
241-
}
242-
// emit one counter increment per distinct (state, code) combination
for state, codes := range eventsPerStateAndCode {
243-
for code, count := range codes {
244-
stats.Default.NewTaggedStat(`pipeline_processed_events`, stats.CountType, stats.Tags{
245-
"module": "batch_router",
246-
"destType": brt.destType,
247-
"state": state,
248-
"code": code,
249-
}).Count(count)
250-
}
251-
}
252-
}
253-
254232
// pipelineDelayStats reports the delay of the pipeline as a range:
255233
//
256234
// - max - time elapsed since the first job was created

router/batchrouter/worker.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,14 @@ func (w *worker) processJobAsync(jobsWg *sync.WaitGroup, destinationJobs *Destin
7070
var drainList []*jobsdb.JobStatusT
7171
var drainJobList []*jobsdb.JobT
7272
drainStatsbyDest := make(map[string]*routerutils.DrainStats)
73+
jobIDConnectionDetailsMap := make(map[int64]jobsdb.ConnectionDetails)
7374

7475
jobsBySource := make(map[string][]*jobsdb.JobT)
7576
for _, job := range destinationJobs.jobs {
77+
jobIDConnectionDetailsMap[job.JobID] = jobsdb.ConnectionDetails{
78+
SourceID: gjson.GetBytes(job.Parameters, "source_id").String(),
79+
DestinationID: destWithSources.Destination.ID,
80+
}
7681
if drain, reason := brt.drainer.Drain(
7782
job,
7883
); drain {
@@ -153,7 +158,7 @@ func (w *worker) processJobAsync(jobsWg *sync.WaitGroup, destinationJobs *Destin
153158
if err != nil {
154159
panic(err)
155160
}
156-
brt.updateProcessedEventsMetrics(statusList)
161+
routerutils.UpdateProcessedEventsMetrics(stats.Default, module, brt.destType, statusList, jobIDConnectionDetailsMap)
157162
for destID, destDrainStat := range drainStatsbyDest {
158163
stats.Default.NewTaggedStat("drained_events", stats.CountType, stats.Tags{
159164
"destType": brt.destType,

router/handle.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ import (
4545
utilTypes "github.com/rudderlabs/rudder-server/utils/types"
4646
)
4747

48+
const module = "router"
49+
4850
// Handle is the handle to this module.
4951
type Handle struct {
5052
// external dependencies
@@ -324,6 +326,7 @@ func (rt *Handle) commitStatusList(workerJobStatuses *[]workerJobStatus) {
324326
var completedJobsList []*jobsdb.JobT
325327
var statusList []*jobsdb.JobStatusT
326328
var routerAbortedJobs []*jobsdb.JobT
329+
jobIDConnectionDetailsMap := make(map[int64]jobsdb.ConnectionDetails)
327330
for _, workerJobStatus := range *workerJobStatuses {
328331
var parameters routerutils.JobParameters
329332
err := json.Unmarshal(workerJobStatus.job.Parameters, &parameters)
@@ -337,6 +340,10 @@ func (rt *Handle) commitStatusList(workerJobStatuses *[]workerJobStatus) {
337340
workspaceID := workerJobStatus.status.WorkspaceId
338341
eventName := gjson.GetBytes(workerJobStatus.job.Parameters, "event_name").String()
339342
eventType := gjson.GetBytes(workerJobStatus.job.Parameters, "event_type").String()
343+
jobIDConnectionDetailsMap[workerJobStatus.job.JobID] = jobsdb.ConnectionDetails{
344+
SourceID: parameters.SourceID,
345+
DestinationID: parameters.DestinationID,
346+
}
340347
key := fmt.Sprintf("%s:%s:%s:%s:%s:%s:%s", parameters.SourceID, parameters.DestinationID, parameters.SourceJobRunID, workerJobStatus.status.JobState, workerJobStatus.status.ErrorCode, eventName, eventType)
341348
_, ok := connectionDetailsMap[key]
342349
if !ok {
@@ -454,7 +461,7 @@ func (rt *Handle) commitStatusList(workerJobStatuses *[]workerJobStatus) {
454461
if err != nil {
455462
panic(err)
456463
}
457-
rt.updateProcessedEventsMetrics(statusList)
464+
routerutils.UpdateProcessedEventsMetrics(stats.Default, module, rt.destType, statusList, jobIDConnectionDetailsMap)
458465
for workspace, jobCount := range routerWorkspaceJobStatusCount {
459466
rmetrics.DecreasePendingEvents(
460467
"rt",

router/handle_observability.go

-22
Original file line numberDiff line numberDiff line change
@@ -102,28 +102,6 @@ func (rt *Handle) updateRudderSourcesStats(
102102
return err
103103
}
104104

105-
// updateProcessedEventsMetrics tallies the given job statuses by JobState and
// ErrorCode and reports every (state, code) count on the
// pipeline_processed_events counter, tagged with module "router", the
// handle's destType, the state and the code.
func (rt *Handle) updateProcessedEventsMetrics(statusList []*jobsdb.JobStatusT) {
106-
// state -> error code -> number of statuses seen
eventsPerStateAndCode := map[string]map[string]int{}
107-
for i := range statusList {
108-
state := statusList[i].JobState
109-
code := statusList[i].ErrorCode
110-
// lazily create the inner map the first time a state is seen
if _, ok := eventsPerStateAndCode[state]; !ok {
111-
eventsPerStateAndCode[state] = map[string]int{}
112-
}
113-
eventsPerStateAndCode[state][code]++
114-
}
115-
// emit one counter increment per distinct (state, code) combination
for state, codes := range eventsPerStateAndCode {
116-
for code, count := range codes {
117-
stats.Default.NewTaggedStat(`pipeline_processed_events`, stats.CountType, stats.Tags{
118-
"module": "router",
119-
"destType": rt.destType,
120-
"state": state,
121-
"code": code,
122-
}).Count(count)
123-
}
124-
}
125-
}
126-
127105
func (rt *Handle) sendRetryStoreStats(attempt int) {
128106
rt.logger.Warnf("Timeout during store jobs in router module, attempt %d", attempt)
129107
stats.Default.NewTaggedStat("jobsdb_store_timeout", stats.CountType, stats.Tags{"attempt": fmt.Sprint(attempt), "module": "router"}).Count(1)

0 commit comments

Comments
 (0)