Skip to content

Commit

Permalink
Add Allocation Service Metrics (#414)
Browse files Browse the repository at this point in the history
* added allocation time taken metric

* fixed syntax errors

* added grafana panel for allocations time taken metric

* Added metric for allocation retries.

* added grafana widgets for game state based metrics

* Added 429, 404, and 500 error metrics.

* Added 409 error metric.

* addressing comments

* Revert dashboard.json

Co-authored-by: abbasahmed <[email protected]>
Co-authored-by: abbasahmed <[email protected]>
Co-authored-by: ghov <[email protected]>
Co-authored-by: abbasahmed <[email protected]>
  • Loading branch information
5 people committed Oct 12, 2022
1 parent 1fbc7b5 commit c6eed87
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 0 deletions.
11 changes: 11 additions & 0 deletions pkg/operator/controllers/allocation_api_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,8 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r *
return
}

timeToAllocateStartTime := time.Now()

// allocation using the heap
for i := 0; i < allocationTries; i++ {
if i > 0 {
Expand All @@ -289,6 +291,7 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r *
if gs == nil {
// pop from queue returned nil, this means no more game servers in this build
tooManyRequestsError(w, s.logger, fmt.Errorf("not enough standingBy"), "there are not enough standingBy servers")
Allocations429ErrorsCounter.WithLabelValues(args.BuildID).Inc()
return
}

Expand Down Expand Up @@ -316,10 +319,13 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r *
if err != nil {
if apierrors.IsConflict(err) {
s.logger.Info("conflict error patching game server", "error", err, "sessionID", args.SessionID, "buildID", args.BuildID, "retry", i)
Allocations409ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc()
} else if apierrors.IsNotFound(err) {
s.logger.Info("error not found patching game server", "error", err, "sessionID", args.SessionID, "buildID", args.BuildID, "retry", i)
Allocations404ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc()
} else {
s.logger.Error(err, "uknown error patching game server", "sessionID", args.SessionID, "buildID", args.BuildID, "retry", i)
Allocations500ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc()
}
// in case of any error, trigger a reconciliation for this GameServer object
// so it's re-added to the queue
Expand All @@ -339,10 +345,15 @@ func (s *AllocationApiServer) handleAllocationRequest(w http.ResponseWriter, r *
err = json.NewEncoder(w).Encode(rs)
if err != nil {
internalServerError(w, s.logger, err, "encode json response")
Allocations500ErrorsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc()
return
}
s.logger.Info("Allocated GameServer", "name", gs2.Name, "sessionID", args.SessionID, "buildID", args.BuildID, "ip", gs2.Status.PublicIP, "ports", gs2.Status.Ports)
AllocationsCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc()
if i > 0 {
AllocationsRetriesCounter.WithLabelValues(gs2.Labels[LabelBuildName]).Inc()
}
AllocationsTimeTakenDuration.WithLabelValues(gs2.Labels[LabelBuildName]).Set(float64(time.Since(timeToAllocateStartTime).Milliseconds()))
return
}

Expand Down
48 changes: 48 additions & 0 deletions pkg/operator/controllers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,52 @@ var (
},
[]string{"BuildName"},
)
// AllocationsTimeTakenDuration is a per-build gauge holding how long the most
// recent successful allocation took, in milliseconds.
//
// handleAllocationRequest overwrites it with Set on every successful
// allocation, so the exported value is the latest observation and not an
// average; any averaging has to be done at query time.
AllocationsTimeTakenDuration = registry.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: "thundernetes",
		Name:      "allocations_time_taken_duration",
		Help:      "Time in milliseconds it took to complete the most recent successful GameServer allocation",
	},
	[]string{"BuildName"},
)
// AllocationsRetriesCounter counts, per build, the successful allocations that
// did not succeed on the first attempt. handleAllocationRequest increments it
// once when the allocation loop index is greater than zero at success time, so
// it tracks allocations that needed a retry rather than individual retries.
AllocationsRetriesCounter = registry.NewCounterVec(prometheus.CounterOpts{
	Namespace: "thundernetes",
	Name:      "allocations_retried",
	Help:      "The number of times allocation had to be retried",
}, []string{"BuildName"})
// Allocations429ErrorsCounter counts allocation requests rejected with a
// "too many requests" response because no standingBy GameServer could be
// popped from the queue.
// NOTE(review): the caller labels this counter with args.BuildID while the
// label is named BuildName and the other allocation counters use the build
// name label of the GameServer - confirm which value is intended.
Allocations429ErrorsCounter = registry.NewCounterVec(prometheus.CounterOpts{
	Namespace: "thundernetes",
	Name:      "allocations_429",
	Help:      "The number of 429 (too many requests) errors during allocation",
}, []string{"BuildName"})
// Allocations404ErrorsCounter counts, per build, the allocation attempts in
// which updating the selected GameServer failed with a NotFound API error.
Allocations404ErrorsCounter = registry.NewCounterVec(prometheus.CounterOpts{
	Namespace: "thundernetes",
	Name:      "allocations_404",
	Help:      "The number of 404 (not found) errors during allocation",
}, []string{"BuildName"})
// Allocations500ErrorsCounter counts, per build, internal failures during
// allocation: errors while updating the selected GameServer that are neither
// Conflict nor NotFound, and failures to JSON-encode the allocation response.
Allocations500ErrorsCounter = registry.NewCounterVec(prometheus.CounterOpts{
	Namespace: "thundernetes",
	Name:      "allocations_500",
	Help:      "The number of 500 (internal) errors during allocation",
}, []string{"BuildName"})
// Allocations409ErrorsCounter counts, per build, the allocation attempts in
// which updating the selected GameServer failed with a Conflict API error.
Allocations409ErrorsCounter = registry.NewCounterVec(prometheus.CounterOpts{
	Namespace: "thundernetes",
	Name:      "allocations_409",
	Help:      "The number of 409 (request conflict) errors during allocation",
}, []string{"BuildName"})
)

0 comments on commit c6eed87

Please sign in to comment.