From 23f57766dac30587257fe8d4b4ef075da5963a4e Mon Sep 17 00:00:00 2001 From: Benny Zlotnik Date: Sun, 29 Mar 2026 10:31:06 +0300 Subject: [PATCH] optimizations and improvements to build process - cleanups and small performance improvements Signed-off-by: Benny Zlotnik Assited-by: claude-opus-4.6 --- cmd/main.go | 2 +- config/manager/manager.yaml | 2 +- internal/common/tasks/scripts/build_image.sh | 227 ++++++++++------ internal/common/tasks/scripts/common.sh | 5 +- internal/common/tasks/tasks.go | 11 +- internal/controller/imagebuild/controller.go | 46 ++++ internal/controller/imagebuild/metrics.go | 59 ++++ .../controller/imagebuild/metrics_test.go | 255 ++++++++++++++++++ .../controller/operatorconfig/resources.go | 7 +- 9 files changed, 523 insertions(+), 91 deletions(-) create mode 100644 internal/controller/imagebuild/metrics.go create mode 100644 internal/controller/imagebuild/metrics_test.go diff --git a/cmd/main.go b/cmd/main.go index 3c85a2db..990de2e1 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -85,7 +85,7 @@ func main() { var enableHTTP2 bool var mode string var tlsOpts []func(*tls.Config) - flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + flag.StringVar(&metricsAddr, "metrics-bind-address", ":8443", "The address the metrics endpoint binds to. "+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index f1dc4abd..b117539b 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -80,7 +80,7 @@ spec: - --mode=all - --leader-elect - --health-probe-bind-address=:8081 - - --metrics-bind-address=0 + - --metrics-bind-address=:8443 image: controller:latest name: manager env: diff --git a/internal/common/tasks/scripts/build_image.sh b/internal/common/tasks/scripts/build_image.sh index 837f1ecd..34a2d873 100644 --- a/internal/common/tasks/scripts/build_image.sh +++ b/internal/common/tasks/scripts/build_image.sh @@ -2,6 +2,7 @@ # Initialize optimizations WORKSPACE_PATH="$(workspaces.shared-workspace.path)" +BUILD_START_TIME=$(date +%s) detect_stat_command # Make the internal registry trusted @@ -259,21 +260,22 @@ fi # Record the effective builder image used for annotation echo -n "${BUILDER_IMAGE:-}" > /tekton/results/builder-image -# Capture AIB version and pinned image reference for disk artifact annotations -AIB_VERSION=$(aib --version 2>&1 | head -1 | tr -d '\r\n' | sed 's/^aib //' || echo "") -echo -n "${AIB_VERSION}" > /tekton/results/aib-version - +# Start AIB metadata capture in the background (runs in parallel with the build) AIB_IMAGE_REF="$(params.automotive-image-builder)" -AIB_IMAGE_DIGEST=$(skopeo inspect --format '{{.Digest}}' "docker://${AIB_IMAGE_REF}" 2>/dev/null || echo "") -if [ -n "$AIB_IMAGE_DIGEST" ]; then - AIB_IMAGE_BASE=$(echo "$AIB_IMAGE_REF" | sed 's/:[^/]*$//') - AIB_IMAGE_PINNED="${AIB_IMAGE_BASE}@${AIB_IMAGE_DIGEST}" - echo "AIB image pinned reference: $AIB_IMAGE_PINNED" -else - AIB_IMAGE_PINNED="$AIB_IMAGE_REF" - echo "Could not resolve AIB image digest, recording reference as-is" -fi -echo -n "${AIB_IMAGE_PINNED}" > /tekton/results/automotive-image-builder +( + aib --version 2>&1 | head -1 | tr -d '\r\n' | sed 's/^aib //' > /tmp/aib-version.txt 2>/dev/null || true + AIB_DIGEST=$(skopeo inspect --format '{{.Digest}}' "docker://${AIB_IMAGE_REF}" 2>/dev/null || echo "") + if [ -n "$AIB_DIGEST" ]; then + case "$AIB_IMAGE_REF" in + *@*) echo -n "$AIB_IMAGE_REF" > /tmp/aib-pinned.txt ;; + *) AIB_BASE=$(echo "$AIB_IMAGE_REF" | sed 's/:[^/]*$//') + echo -n "${AIB_BASE}@${AIB_DIGEST}" > /tmp/aib-pinned.txt ;; + esac + else + echo -n "$AIB_IMAGE_REF" > /tmp/aib-pinned.txt + fi +) & +AIB_METADATA_PID=$! # Set up builder image if needed (consolidated logic) declare -a BUILD_CONTAINER_ARGS=() @@ -319,29 +321,57 @@ if [ "$(params.use-persistent-cache)" = "true" ]; then COMMON_BUILD_ARGS+=(--cache-max-size=unlimited) fi +AIB_INVOKE_TIME=$(date +%s) +echo "⏱ Setup phase: $((AIB_INVOKE_TIME - BUILD_START_TIME))s" + case "$BUILD_MODE" in bootc) - # Build bootc container and optionally disk image in a single command - # aib build takes: manifest out [disk] where disk is optional - declare -a DISK_OUTPUT_ARGS=() - if [ "$BUILD_DISK_IMAGE" = "true" ]; then - DISK_OUTPUT_ARGS=("/output/${exportFile}") + # When both container push and disk build are needed, split into two phases + # so container push can overlap with disk creation: + # Phase 1: aib build (container only, no disk) + # Phase 2: container push (background) + aib to-disk-image (foreground, parallel) + # When only one is needed, use the single aib build command. + SPLIT_BUILD="false" + if [ -n "$CONTAINER_PUSH" ] && [ "$BUILD_DISK_IMAGE" = "true" ]; then + SPLIT_BUILD="true" + fi + + if [ "$SPLIT_BUILD" = "true" ]; then + # Phase 1: Build container only (no disk output) + declare -a AIB_CMD=( + aib --verbose build + --distro "$(params.distro)" + --target "$(params.target)" + "--arch=${arch}" + "${COMMON_BUILD_ARGS[@]}" + "${BUILD_CONTAINER_ARGS[@]}" + "${CUSTOM_DEFS_ARGS[@]}" + "${AIB_EXTRA_ARGS[@]}" + "$MANIFEST_FILE" + "$BOOTC_CONTAINER_NAME" + ) + else + declare -a DISK_OUTPUT_ARGS=() + if [ "$BUILD_DISK_IMAGE" = "true" ]; then + DISK_OUTPUT_ARGS=("/output/${exportFile}") + fi + + declare -a AIB_CMD=( + aib --verbose build + --distro "$(params.distro)" + --target "$(params.target)" + "--arch=${arch}" + "${COMMON_BUILD_ARGS[@]}" + "${FORMAT_ARGS[@]}" + "${BUILD_CONTAINER_ARGS[@]}" + "${CUSTOM_DEFS_ARGS[@]}" + "${AIB_EXTRA_ARGS[@]}" + "$MANIFEST_FILE" + "$BOOTC_CONTAINER_NAME" + "${DISK_OUTPUT_ARGS[@]}" + ) fi - declare -a AIB_CMD=( - aib --verbose build - --distro "$(params.distro)" - --target "$(params.target)" - "--arch=${arch}" - "${COMMON_BUILD_ARGS[@]}" - "${FORMAT_ARGS[@]}" - "${BUILD_CONTAINER_ARGS[@]}" - "${CUSTOM_DEFS_ARGS[@]}" - "${AIB_EXTRA_ARGS[@]}" - "$MANIFEST_FILE" - "$BOOTC_CONTAINER_NAME" - "${DISK_OUTPUT_ARGS[@]}" - ) AIB_COMMAND=$(printf '%q ' "${AIB_CMD[@]}" | sed 's/ $//') echo -n "$AIB_COMMAND" > /tekton/results/aib-command @@ -349,26 +379,34 @@ case "$BUILD_MODE" in emit_progress "Building image" "$STEP_BUILD" "$PROGRESS_TOTAL" "${AIB_CMD[@]}" + CONTAINER_PUSH_PID="" if [ -n "$CONTAINER_PUSH" ]; then emit_progress "Pushing container" "$((STEP_BUILD + 1))" "$PROGRESS_TOTAL" - PUSH_SRC="containers-storage:$BOOTC_CONTAINER_NAME" if [ -z "$BUILDER_IMAGE" ]; then echo "Error: BUILDER_IMAGE is empty; cannot annotate bootc container" exit 1 fi - # Add builder-image as manifest annotation + config label. - echo "Annotating bootc container with builder image: $BUILDER_IMAGE" - OCI_DIR="/tmp/bootc-oci" - rm -rf "$OCI_DIR" - skopeo copy "$PUSH_SRC" "oci:${OCI_DIR}:latest" - - echo "AIB version for annotation: ${AIB_VERSION:-unknown}" - echo "AIB image for annotation: ${AIB_IMAGE_PINNED}" - - # Use inline Python for OCI annotation (extracted from original embedded code) - python3 - "$OCI_DIR" "$BUILDER_IMAGE" "$AIB_VERSION" "$AIB_IMAGE_PINNED" "$AIB_COMMAND" <<'PYEOF' + # Annotate + push container to registry in the background. + # This overlaps with disk compression, saving wall-clock time. + ( + PUSH_START=$(date +%s) + PUSH_SRC="containers-storage:$BOOTC_CONTAINER_NAME" + OCI_DIR="/tmp/bootc-oci" + rm -rf "$OCI_DIR" + skopeo copy "$PUSH_SRC" "oci:${OCI_DIR}:latest" + COPY_DONE=$(date +%s) + echo "⏱ Container copy to OCI: $((COPY_DONE - PUSH_START))s" + + # Wait for AIB metadata to be available for annotations + # (can't use `wait` here — AIB_METADATA_PID is a sibling, not a child of this subshell) + while kill -0 "$AIB_METADATA_PID" 2>/dev/null; do sleep 1; done + _AIB_VERSION=$(cat /tmp/aib-version.txt 2>/dev/null || echo "") + _AIB_IMAGE_PINNED=$(cat /tmp/aib-pinned.txt 2>/dev/null || echo "$AIB_IMAGE_REF") + + echo "Annotating bootc container with builder image: $BUILDER_IMAGE" + python3 - "$OCI_DIR" "$BUILDER_IMAGE" "$_AIB_VERSION" "$_AIB_IMAGE_PINNED" "$AIB_COMMAND" <<'PYEOF' import json, sys, hashlib, os from pathlib import Path @@ -415,18 +453,37 @@ manifest_entry["digest"], manifest_entry["size"] = update_blob(oci_dir, manifest index_path.write_text(json.dumps(index, indent=2)) PYEOF - PUSH_SRC="oci:${OCI_DIR}:latest" - - echo "Pushing container to registry: $CONTAINER_PUSH" - skopeo copy --authfile="$REGISTRY_AUTH_FILE" "$PUSH_SRC" "docker://$CONTAINER_PUSH" - rm -rf "${OCI_DIR:-/tmp/nonexistent}" 2>/dev/null || true - echo "Container pushed successfully to $CONTAINER_PUSH" + ANNOTATE_DONE=$(date +%s) + echo "⏱ Container annotate: $((ANNOTATE_DONE - COPY_DONE))s" + + echo "Pushing container to registry: $CONTAINER_PUSH" + skopeo copy --authfile="$REGISTRY_AUTH_FILE" "oci:${OCI_DIR}:latest" "docker://$CONTAINER_PUSH" + rm -rf "${OCI_DIR:-/tmp/nonexistent}" 2>/dev/null || true + PUSH_DONE=$(date +%s) + echo "⏱ Container registry push: $((PUSH_DONE - ANNOTATE_DONE))s" + echo "⏱ Container push total: $((PUSH_DONE - PUSH_START))s" + echo "Container pushed successfully to $CONTAINER_PUSH" + ) & + CONTAINER_PUSH_PID=$! fi - if [ "$BUILD_DISK_IMAGE" = "true" ]; then + if [ "$SPLIT_BUILD" = "true" ]; then + # Phase 2b: Create disk image in foreground (runs in parallel with container push) + echo "Creating disk image from container (parallel with push)..." + DISK_START=$(date +%s) + aib --verbose to-disk-image \ + "${FORMAT_ARGS[@]}" \ + "${BUILD_CONTAINER_ARGS[@]}" \ + "${AIB_EXTRA_ARGS[@]}" \ + "$BOOTC_CONTAINER_NAME" \ + "/output/${exportFile}" + DISK_DONE=$(date +%s) + echo "⏱ Disk creation: $((DISK_DONE - DISK_START))s" + echo "Disk image created: /output/${exportFile}" + elif [ "$BUILD_DISK_IMAGE" = "true" ]; then echo "Disk image created: /output/${exportFile}" - # Note: Disk image push to OCI registry is handled by the separate push-disk-artifact task fi + # Note: Disk image push to OCI registry is handled by the separate push-disk-artifact task ;; image|package) declare -a AIB_CMD=( @@ -490,45 +547,37 @@ PYEOF ;; esac -echo "Build completed. Contents of output directory:" -ls -la /output/ || true +AIB_END_TIME=$(date +%s) +echo "⏱ AIB build phase: $((AIB_END_TIME - BUILD_START_TIME))s" + +# Wait for background AIB metadata capture to finish +wait $AIB_METADATA_PID 2>/dev/null || true +AIB_VERSION=$(cat /tmp/aib-version.txt 2>/dev/null || echo "") +echo -n "${AIB_VERSION}" > /tekton/results/aib-version +AIB_IMAGE_PINNED=$(cat /tmp/aib-pinned.txt 2>/dev/null || echo "$AIB_IMAGE_REF") +echo "AIB image pinned reference: $AIB_IMAGE_PINNED" +echo -n "${AIB_IMAGE_PINNED}" > /tekton/results/automotive-image-builder pushd /output mkdir -p "$WORKSPACE_PATH" # Check if disk image was created (only exists when BUILD_DISK_IMAGE=true or non-bootc mode) DISK_IMAGE_EXISTS=false +DISK_IMAGE_SOURCE="/output" if [ -e "/output/${exportFile}" ]; then DISK_IMAGE_EXISTS=true - ln -sf ./${exportFile} ./disk.img - - echo "copying build artifacts to shared workspace..." if [ -d "/output/${exportFile}" ]; then - echo "${exportFile} is a directory, copying recursively..." + echo "${exportFile} is a directory, copying recursively to workspace..." cp -rv "/output/${exportFile}" "$WORKSPACE_PATH/" || echo "Failed to copy ${exportFile}" - else - echo "${exportFile} is a regular file, copying..." - cp -v "/output/${exportFile}" "$WORKSPACE_PATH/" || echo "Failed to copy ${exportFile}" - fi - - pushd "$WORKSPACE_PATH" - if [ -d "${exportFile}" ]; then - echo "Creating symlink to directory ${exportFile}" - ln -sf ${exportFile} disk.img - elif [ -f "${exportFile}" ]; then - echo "Creating symlink to file ${exportFile}" - ln -sf ${exportFile} disk.img + DISK_IMAGE_SOURCE="$WORKSPACE_PATH" fi - popd + # For regular files, skip copy — compress directly from /output later else echo "No disk image created (container-only build)" fi -cp -v "$BUILD_DIR/image.json" "$WORKSPACE_PATH/image.json" || echo "Failed to copy image.json" - -echo "Contents of shared workspace:" -ls -la "$WORKSPACE_PATH/" +cp "$BUILD_DIR/image.json" "$WORKSPACE_PATH/image.json" || echo "Failed to copy image.json" COMPRESSION="$(params.compression)" if [ "$BUILD_DISK_IMAGE" = "true" ] || [ "$BUILD_MODE" = "image" ] || [ "$BUILD_MODE" = "package" ] || [ "$BUILD_MODE" = "disk" ]; then @@ -647,9 +696,9 @@ elif [ -d "$WORKSPACE_PATH/${exportFile}" ]; then ls -la "$WORKSPACE_PATH/${final_compressed_name}-parts/" || true fi fi -elif [ -f "$WORKSPACE_PATH/${exportFile}" ]; then - echo "Creating compressed file ${exportFile}${EXT_FILE} in shared workspace..." - compress_file "$WORKSPACE_PATH/${exportFile}" "$WORKSPACE_PATH/${exportFile}${EXT_FILE}" || echo "Failed to create ${exportFile}${EXT_FILE}" +elif [ -f "${DISK_IMAGE_SOURCE}/${exportFile}" ]; then + echo "Compressing ${exportFile} directly to workspace..." + compress_file "${DISK_IMAGE_SOURCE}/${exportFile}" "$WORKSPACE_PATH/${exportFile}${EXT_FILE}" || { echo "Error: Failed to compress ${exportFile}"; exit 1; } echo "Compressed file size:" && ls -lah "$WORKSPACE_PATH/${exportFile}${EXT_FILE}" || true if [ -f "$WORKSPACE_PATH/${exportFile}${EXT_FILE}" ]; then pushd "$WORKSPACE_PATH" @@ -681,14 +730,22 @@ fi emit_progress "Finalizing build" "$PROGRESS_TOTAL" "$PROGRESS_TOTAL" if [ -n "$final_name" ]; then - echo "Writing artifact filename to Tekton result: $final_name" echo "$final_name" > /tekton/results/artifact-filename || echo "Failed to write Tekton result" - echo "Verifying Tekton result file:" - cat /tekton/results/artifact-filename || echo "Failed to read Tekton result" else echo "Warning: final_name is empty, no artifact filename will be recorded" fi -echo "Syncing filesystem to ensure all artifacts are written..." -sync -echo "Filesystem sync completed" \ No newline at end of file +# Wait for background container push to complete (bootc mode only) +if [ -n "${CONTAINER_PUSH_PID:-}" ]; then + echo "Waiting for container push to complete..." + wait "$CONTAINER_PUSH_PID" || { echo "Error: Container push failed"; exit 1; } +fi + +BUILD_END_TIME=$(date +%s) +echo "⏱ Post-build phase: $((BUILD_END_TIME - AIB_END_TIME))s" +echo "⏱ Total build-image step: $((BUILD_END_TIME - BUILD_START_TIME))s" + +# Write structured timing data as a Tekton result for Prometheus metrics +cat > /tekton/results/build-timing < /dev/null 2>&1 || true + > /dev/null 2>&1 || true) & } INTERNAL_REGISTRY="image-registry.openshift-image-registry.svc:5000" diff --git a/internal/common/tasks/tasks.go b/internal/common/tasks/tasks.go index 69e44af0..4bc3f3ea 100644 --- a/internal/common/tasks/tasks.go +++ b/internal/common/tasks/tasks.go @@ -428,6 +428,10 @@ func GenerateBuildAutomotiveImageTask(namespace string, buildConfig *BuildConfig Name: "aib-command", Description: "The exact AIB command used to build the image", }, + { + Name: "build-timing", + Description: "JSON timing breakdown of build phases in seconds", + }, }, Workspaces: []tektonv1.WorkspaceDeclaration{ { @@ -583,7 +587,7 @@ func GenerateBuildAutomotiveImageTask(namespace string, buildConfig *BuildConfig for i := range task.Spec.Volumes { vol := &task.Spec.Volumes[i] - if vol.Name == "build-dir" || vol.Name == "run-dir" || vol.Name == "container-storage" { + if vol.Name == "build-dir" || vol.Name == "run-dir" || vol.Name == "container-storage" || vol.Name == "output-dir" { vol.EmptyDir = &corev1.EmptyDirVolumeSource{ Medium: corev1.StorageMediumMemory, } @@ -856,6 +860,11 @@ func GenerateTektonPipeline(name, namespace string, buildConfig *BuildConfig) *t Description: "The Jumpstarter lease ID acquired during flash (empty if flash not enabled)", Value: tektonv1.ParamValue{Type: tektonv1.ParamTypeString, StringVal: "$(tasks.flash-image.results.lease-id)"}, }, + { + Name: "build-timing", + Description: "JSON timing breakdown of build phases in seconds", + Value: tektonv1.ParamValue{Type: tektonv1.ParamTypeString, StringVal: "$(tasks.build-image.results.build-timing)"}, + }, }, Tasks: []tektonv1.PipelineTask{ { diff --git a/internal/controller/imagebuild/controller.go b/internal/controller/imagebuild/controller.go index 51a903d3..fc769c08 100644 --- a/internal/controller/imagebuild/controller.go +++ b/internal/controller/imagebuild/controller.go @@ -354,6 +354,8 @@ func (r *ImageBuildReconciler) checkBuildProgress( log.Error(err, "Failed to patch status to Completed") return ctrl.Result{}, err } + recordBuildMetrics(fresh, pipelineRun, buildStatusSuccess) + r.emitEventf( fresh, corev1.EventTypeNormal, @@ -384,6 +386,7 @@ func (r *ImageBuildReconciler) checkBuildProgress( log.Error(err, "Failed to update status to Failed") return ctrl.Result{}, err } + recordBuildMetrics(imageBuild, pipelineRun, buildStatusFailure) return ctrl.Result{}, nil } @@ -1604,6 +1607,49 @@ func taskRunFailureMessage(taskRun *tektonv1.TaskRun, fallback string) string { } // extractProvenance extracts build provenance information from PipelineRun results +// buildTiming holds the timing breakdown written by build_image.sh. +type buildTiming struct { + SetupS float64 `json:"setup_s"` + BuildS float64 `json:"build_s"` + PostBuildS float64 `json:"post_build_s"` +} + +// recordBuildMetrics records Prometheus metrics from a completed build. +func recordBuildMetrics(imageBuild *automotivev1alpha1.ImageBuild, pipelineRun *tektonv1.PipelineRun, status string) { + mode := imageBuild.Spec.GetMode() + distro := imageBuild.Spec.GetDistro() + target := imageBuild.Spec.GetTarget() + format := imageBuild.Spec.GetExportFormat() + arch := imageBuild.Spec.Architecture + + BuildTotal.WithLabelValues(mode, distro, target, format, arch, status).Inc() + + // Record wall-clock duration from CR timestamps + if imageBuild.Status.StartTime != nil && imageBuild.Status.CompletionTime != nil { + duration := imageBuild.Status.CompletionTime.Sub(imageBuild.Status.StartTime.Time).Seconds() + BuildDuration.WithLabelValues(mode, distro, target, format, arch, status).Observe(duration) + } + + // Record phase-level timing from the build-timing Tekton result + if status != buildStatusSuccess || pipelineRun == nil { + return + } + for _, result := range pipelineRun.Status.Results { + if result.Name != "build-timing" { + continue + } + var timing buildTiming + if err := json.Unmarshal([]byte(result.Value.StringVal), &timing); err != nil { + break + } + labels := []string{mode, distro, target} + BuildPhaseDuration.WithLabelValues(append(labels, "setup")...).Observe(timing.SetupS) + BuildPhaseDuration.WithLabelValues(append(labels, "build")...).Observe(timing.BuildS) + BuildPhaseDuration.WithLabelValues(append(labels, "post_build")...).Observe(timing.PostBuildS) + break + } +} + func extractProvenance(pipelineRun *tektonv1.PipelineRun, aibImage string) (aibImageUsed, builderImageUsed string) { aibImageUsed = aibImage // Always record the AIB image that was requested diff --git a/internal/controller/imagebuild/metrics.go b/internal/controller/imagebuild/metrics.go new file mode 100644 index 00000000..05e64676 --- /dev/null +++ b/internal/controller/imagebuild/metrics.go @@ -0,0 +1,59 @@ +package imagebuild + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +const ( + metricsNamespace = "ado" + metricsSubsystem = "build" + + buildStatusSuccess = "success" + buildStatusFailure = "failure" +) + +var ( + // BuildDuration tracks the total build duration in seconds. + BuildDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "duration_seconds", + Help: "Total build duration in seconds", + Buckets: []float64{30, 60, 120, 180, 240, 300, 420, 600, 900, 1200}, + }, + []string{"mode", "distro", "target", "format", "arch", "status"}, + ) + + // BuildPhaseDuration tracks duration of individual build phases in seconds. + BuildPhaseDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "phase_duration_seconds", + Help: "Duration of individual build phases in seconds", + Buckets: []float64{1, 5, 10, 30, 60, 120, 180, 240, 300, 600}, + }, + []string{"mode", "distro", "target", "phase"}, + ) + + // BuildTotal counts total builds by status. + BuildTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "total", + Help: "Total number of builds by status", + }, + []string{"mode", "distro", "target", "format", "arch", "status"}, + ) +) + +func init() { + metrics.Registry.MustRegister( + BuildDuration, + BuildPhaseDuration, + BuildTotal, + ) +} diff --git a/internal/controller/imagebuild/metrics_test.go b/internal/controller/imagebuild/metrics_test.go new file mode 100644 index 00000000..7a5af4a4 --- /dev/null +++ b/internal/controller/imagebuild/metrics_test.go @@ -0,0 +1,255 @@ +package imagebuild + +import ( + "testing" + "time" + + automotivev1alpha1 "github.com/centos-automotive-suite/automotive-dev-operator/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + io_prometheus_client "github.com/prometheus/client_model/go" + tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// counterValue returns the current value of a counter with the given labels. +func counterValue(cv *prometheus.CounterVec, labels ...string) float64 { + m := &io_prometheus_client.Metric{} + if err := cv.WithLabelValues(labels...).Write(m); err != nil { + return 0 + } + return m.GetCounter().GetValue() +} + +// histogramCount returns the sample count of a histogram with the given labels. +func histogramCount(hv *prometheus.HistogramVec, labels ...string) uint64 { + m := &io_prometheus_client.Metric{} + obs, err := hv.GetMetricWithLabelValues(labels...) + if err != nil { + return 0 + } + if err := obs.(prometheus.Metric).Write(m); err != nil { + return 0 + } + return m.GetHistogram().GetSampleCount() +} + +// histogramSum returns the sample sum of a histogram with the given labels. +func histogramSum(hv *prometheus.HistogramVec, labels ...string) float64 { + m := &io_prometheus_client.Metric{} + obs, err := hv.GetMetricWithLabelValues(labels...) + if err != nil { + return 0 + } + if err := obs.(prometheus.Metric).Write(m); err != nil { + return 0 + } + return m.GetHistogram().GetSampleSum() +} + +func newImageBuild(mode, target, format, arch string, start, end *metav1.Time) *automotivev1alpha1.ImageBuild { + ib := &automotivev1alpha1.ImageBuild{ + Spec: automotivev1alpha1.ImageBuildSpec{ + Architecture: arch, + AIB: &automotivev1alpha1.AIBSpec{ + Distro: "autosd", + Target: target, + Mode: mode, + }, + Export: &automotivev1alpha1.ExportSpec{ + Format: format, + }, + }, + Status: automotivev1alpha1.ImageBuildStatus{ + StartTime: start, + CompletionTime: end, + }, + } + return ib +} + +func pipelineRunWithTiming(json string) *tektonv1.PipelineRun { + return &tektonv1.PipelineRun{ + Status: tektonv1.PipelineRunStatus{ + PipelineRunStatusFields: tektonv1.PipelineRunStatusFields{ + Results: []tektonv1.PipelineRunResult{ + { + Name: "build-timing", + Value: tektonv1.ResultValue{Type: tektonv1.ParamTypeString, StringVal: json}, + }, + }, + }, + }, + } +} + +func TestRecordBuildMetrics_Counter(t *testing.T) { + labels := []string{"package", "autosd", "ebbr", "simg", "arm64", "success"} + before := counterValue(BuildTotal, labels...) + + start := metav1.NewTime(time.Now().Add(-3 * time.Minute)) + end := metav1.Now() + ib := newImageBuild("package", "ebbr", "simg", "arm64", &start, &end) + pr := pipelineRunWithTiming(`{"setup_s":2,"build_s":170,"post_build_s":8,"total_s":180}`) + + recordBuildMetrics(ib, pr, buildStatusSuccess) + + after := counterValue(BuildTotal, labels...) + if after-before != 1 { + t.Errorf("BuildTotal counter increment = %v, want 1", after-before) + } +} + +func TestRecordBuildMetrics_FailureCounter(t *testing.T) { + labels := []string{"package", "autosd", "ebbr", "simg", "amd64", "failure"} + before := counterValue(BuildTotal, labels...) + + ib := newImageBuild("package", "ebbr", "simg", "amd64", nil, nil) + recordBuildMetrics(ib, nil, buildStatusFailure) + + after := counterValue(BuildTotal, labels...) + if after-before != 1 { + t.Errorf("BuildTotal failure counter increment = %v, want 1", after-before) + } +} + +func TestRecordBuildMetrics_Duration(t *testing.T) { + labels := []string{"package", "autosd", "ebbr", "simg", "arm64", "success"} + beforeCount := histogramCount(BuildDuration, labels...) + beforeSum := histogramSum(BuildDuration, labels...) + + start := metav1.NewTime(time.Now().Add(-180 * time.Second)) + end := metav1.Now() + ib := newImageBuild("package", "ebbr", "simg", "arm64", &start, &end) + pr := pipelineRunWithTiming(`{"setup_s":2,"build_s":170,"post_build_s":8,"total_s":180}`) + + recordBuildMetrics(ib, pr, buildStatusSuccess) + + afterCount := histogramCount(BuildDuration, labels...) + afterSum := histogramSum(BuildDuration, labels...) + if afterCount-beforeCount != 1 { + t.Errorf("BuildDuration sample count increment = %v, want 1", afterCount-beforeCount) + } + delta := afterSum - beforeSum + if delta < 179 || delta > 181 { + t.Errorf("BuildDuration observed value = %v, want ~180", delta) + } +} + +func TestRecordBuildMetrics_NoDurationWithoutTimestamps(t *testing.T) { + labels := []string{"bootc", "autosd", "ebbr", "simg", "arm64", "success"} + beforeCount := histogramCount(BuildDuration, labels...) + + ib := newImageBuild("bootc", "ebbr", "simg", "arm64", nil, nil) + recordBuildMetrics(ib, &tektonv1.PipelineRun{}, buildStatusSuccess) + + afterCount := histogramCount(BuildDuration, labels...) + if afterCount != beforeCount { + t.Errorf("BuildDuration should not record without timestamps, got count delta %v", afterCount-beforeCount) + } +} + +func TestRecordBuildMetrics_PhaseDurations(t *testing.T) { + setupLabels := []string{"package", "autosd", "ebbr", "setup"} + buildLabels := []string{"package", "autosd", "ebbr", "build"} + postLabels := []string{"package", "autosd", "ebbr", "post_build"} + + beforeSetup := histogramCount(BuildPhaseDuration, setupLabels...) + beforeBuild := histogramCount(BuildPhaseDuration, buildLabels...) + beforePost := histogramCount(BuildPhaseDuration, postLabels...) + + start := metav1.NewTime(time.Now().Add(-3 * time.Minute)) + end := metav1.Now() + ib := newImageBuild("package", "ebbr", "simg", "arm64", &start, &end) + pr := pipelineRunWithTiming(`{"setup_s":5,"build_s":160,"post_build_s":15,"total_s":180}`) + + recordBuildMetrics(ib, pr, buildStatusSuccess) + + if histogramCount(BuildPhaseDuration, setupLabels...)-beforeSetup != 1 { + t.Error("setup phase not recorded") + } + if histogramCount(BuildPhaseDuration, buildLabels...)-beforeBuild != 1 { + t.Error("build phase not recorded") + } + if histogramCount(BuildPhaseDuration, postLabels...)-beforePost != 1 { + t.Error("post_build phase not recorded") + } + + // Verify observed values + setupSum := histogramSum(BuildPhaseDuration, setupLabels...) + if setupSum < 5 { + t.Errorf("setup phase sum = %v, want >= 5", setupSum) + } +} + +func TestRecordBuildMetrics_NoPhaseDurationsOnFailure(t *testing.T) { + labels := []string{"package", "autosd", "x86", "setup"} + beforeCount := histogramCount(BuildPhaseDuration, labels...) + + ib := newImageBuild("package", "x86", "simg", "amd64", nil, nil) + pr := pipelineRunWithTiming(`{"setup_s":5,"build_s":160,"post_build_s":15,"total_s":180}`) + + recordBuildMetrics(ib, pr, buildStatusFailure) + + afterCount := histogramCount(BuildPhaseDuration, labels...) + if afterCount != beforeCount { + t.Error("phase durations should not be recorded on failure") + } +} + +func TestRecordBuildMetrics_MalformedTimingJSON(t *testing.T) { + labels := []string{"package", "autosd", "qemu", "setup"} + beforeCount := histogramCount(BuildPhaseDuration, labels...) + + start := metav1.NewTime(time.Now().Add(-1 * time.Minute)) + end := metav1.Now() + ib := newImageBuild("package", "qemu", "qcow2", "amd64", &start, &end) + pr := pipelineRunWithTiming(`not valid json`) + + // Should not panic or record phase metrics + recordBuildMetrics(ib, pr, buildStatusSuccess) + + afterCount := histogramCount(BuildPhaseDuration, labels...) + if afterCount != beforeCount { + t.Error("malformed JSON should not produce phase metrics") + } +} + +func TestRecordBuildMetrics_NilPipelineRun(t *testing.T) { + labels := []string{"package", "autosd", "none", "simg", "amd64", "success"} + before := counterValue(BuildTotal, labels...) + + ib := newImageBuild("package", "none", "simg", "amd64", nil, nil) + + // Should not panic with nil PipelineRun + recordBuildMetrics(ib, nil, buildStatusSuccess) + + after := counterValue(BuildTotal, labels...) + if after-before != 1 { + t.Errorf("counter should still increment with nil PipelineRun, got delta %v", after-before) + } +} + +func TestRecordBuildMetrics_NoTimingResult(t *testing.T) { + labels := []string{"package", "autosd", "generic", "setup"} + beforeCount := histogramCount(BuildPhaseDuration, labels...) + + start := metav1.NewTime(time.Now().Add(-1 * time.Minute)) + end := metav1.Now() + ib := newImageBuild("package", "generic", "raw", "amd64", &start, &end) + pr := &tektonv1.PipelineRun{ + Status: tektonv1.PipelineRunStatus{ + PipelineRunStatusFields: tektonv1.PipelineRunStatusFields{ + Results: []tektonv1.PipelineRunResult{ + {Name: "other-result", Value: tektonv1.ResultValue{StringVal: "foo"}}, + }, + }, + }, + } + + recordBuildMetrics(ib, pr, buildStatusSuccess) + + afterCount := histogramCount(BuildPhaseDuration, labels...) + if afterCount != beforeCount { + t.Error("should not record phase metrics when build-timing result is absent") + } +} diff --git a/internal/controller/operatorconfig/resources.go b/internal/controller/operatorconfig/resources.go index 58e4e9c0..7ae4a671 100644 --- a/internal/controller/operatorconfig/resources.go +++ b/internal/controller/operatorconfig/resources.go @@ -508,7 +508,7 @@ func (r *OperatorConfigReconciler) buildBuildControllerDeployment(namespace stri "--mode=build", "--leader-elect", "--health-probe-bind-address=:8081", - "--metrics-bind-address=0", + "--metrics-bind-address=:8443", }, Env: []corev1.EnvVar{ { @@ -526,6 +526,11 @@ func (r *OperatorConfigReconciler) buildBuildControllerDeployment(namespace stri ContainerPort: 8081, Protocol: corev1.ProtocolTCP, }, + { + Name: "https", + ContainerPort: 8443, + Protocol: corev1.ProtocolTCP, + }, }, LivenessProbe: &corev1.Probe{ ProbeHandler: corev1.ProbeHandler{