diff --git a/.github/workflows/integration-server-test.yml b/.github/workflows/integration-server-test.yml index 65f0bc589cf..dec3af64d45 100644 --- a/.github/workflows/integration-server-test.yml +++ b/.github/workflows/integration-server-test.yml @@ -3,7 +3,23 @@ name: integration-server-test run-name: Integration Server Test on: - workflow_dispatch: ~ + workflow_dispatch: + inputs: + run-upgrade-tests: + description: 'Run upgrade tests (SNAPSHOT)' + required: false + type: boolean + default: true + run-upgrade-bc-tests: + description: 'Run upgrade tests (BC)' + required: false + type: boolean + default: true + run-standalone-tests: + description: 'Run standalone-to-managed tests' + required: false + type: boolean + default: true schedule: - cron: '0 2 * * 1-5' @@ -31,18 +47,17 @@ jobs: uses: ./.github/workflows/generate-bc-upgrade-paths run-upgrade: - name: Upgrade tests (Snapshot) + if: ${{ !contains(inputs.run-upgrade-tests, 'false') }} + name: Upgrade tests (SNAPSHOT) runs-on: ubuntu-latest strategy: fail-fast: false matrix: upgrade-path: - # Latest 8.15 cannot upgrade to latest 8.17, it can only go to 8.17.3. - # With our current setup (only latest patch), we have to upgrade to intermediate latest 8.16 instead. - # TODO: Maybe add support for upgrading to latest upgradable instead of absolute latest? 
- - '8.15, 8.16, 8.17' - - '8.17, 8.18, 9.0' - - '8.17, 8.19, 9.1' + - '8.15, 8.16, 8.17, 8.18' + - '8.18, 8.19, 9.2' + - '8.18, 9.0, 9.2' + - '8.19, 9.1, 9.2' scenario: - 'Default' - 'Reroute' @@ -68,6 +83,7 @@ jobs: SCENARIO="${{ matrix.scenario }}" UPGRADE_PATH="${{ matrix.upgrade-path }}" SNAPSHOT=true make integration-server-test/upgrade run-upgrade-bc: + if: ${{ !contains(inputs.run-upgrade-bc-tests, 'false') }} name: Upgrade tests (BC) runs-on: ubuntu-latest needs: prepare @@ -100,6 +116,7 @@ jobs: SCENARIO="${{ matrix.scenario }}" UPGRADE_PATH="${{ matrix.upgrade-path }}" make integration-server-test/upgrade run-standalone: + if: ${{ !contains(inputs.run-standalone-tests, 'false') }} name: Standalone-to-managed tests runs-on: ubuntu-latest strategy: diff --git a/integrationservertest/internal/gen/generator.go b/integrationservertest/internal/gen/generator.go index d521a0b5b07..d8106ec1dd3 100644 --- a/integrationservertest/internal/gen/generator.go +++ b/integrationservertest/internal/gen/generator.go @@ -75,6 +75,7 @@ func (g *Generator) RunBlockingWait(ctx context.Context, version ech.Version, in if !integrations { return fmt.Errorf("failed to wait for apm server: %w", err) } + g.logger.Info("re-apply apm policy") if err = g.reapplyAPMPolicy(ctx, version); err != nil { return fmt.Errorf("failed to re-apply apm policy: %w", err) } @@ -84,21 +85,12 @@ func (g *Generator) RunBlockingWait(ctx context.Context, version ech.Version, in } g.logger.Info("ingest data") - if err := g.runBlocking(ctx, version); err != nil { + if err := g.retryRunBlocking(ctx, version, 2); err != nil { return fmt.Errorf("cannot run generator: %w", err) } - // With Fleet managed APM server, we can trigger metrics flush. - if integrations { - g.logger.Info("flush apm metrics") - if err := g.flushAPMMetrics(ctx, version); err != nil { - return fmt.Errorf("cannot flush apm metrics: %w", err) - } - return nil - } - - // With standalone, we don't have Fleet, so simply just wait for some arbitrary time. 
- time.Sleep(180 * time.Second) + // Simply wait for some arbitrary time, for the data to be flushed. + time.Sleep(200 * time.Second) return nil } @@ -159,9 +151,35 @@ func (g *Generator) runBlocking(ctx context.Context, version ech.Version) error return gen.RunBlocking(ctx) } +// retryRunBlocking executes runBlocking. If it fails, it will retry up to retryTimes. +func (g *Generator) retryRunBlocking(ctx context.Context, version ech.Version, retryTimes int) error { + // No error, don't need to retry. + if err := g.runBlocking(ctx, version); err == nil { + return nil + } + + // Otherwise, retry until success or run out of attempts. + var finalErr error + for i := 0; i < retryTimes; i++ { + // Wait for some time before retrying. + time.Sleep(time.Duration(i) * 30 * time.Second) + + g.logger.Info(fmt.Sprintf("retrying ingest data attempt %d", i+1)) + err := g.runBlocking(ctx, version) + // Retry success, simply return. + if err == nil { + return nil + } + + finalErr = err + } + + return finalErr +} + func (g *Generator) reapplyAPMPolicy(ctx context.Context, version ech.Version) error { policyID := "elastic-cloud-apm" - description := fmt.Sprintf("%s %s", version, rand.Text()[5:]) + description := fmt.Sprintf("%s %s", version, rand.Text()[:10]) if err := g.kbc.UpdatePackagePolicyDescriptionByID(ctx, policyID, version, description); err != nil { return fmt.Errorf( @@ -173,22 +191,6 @@ func (g *Generator) reapplyAPMPolicy(ctx context.Context, version ech.Version) e return nil } -// flushAPMMetrics sends an update to the Fleet APM package policy in order -// to trigger the flushing of in-flight APM metrics. -func (g *Generator) flushAPMMetrics(ctx context.Context, version ech.Version) error { - // Re-applying the Elastic APM policy is enough to trigger final aggregations - // in APM Server and flush of in-flight metrics. 
- if err := g.reapplyAPMPolicy(ctx, version); err != nil { - return err - } - - // APM Server needs some time to flush all metrics, and we don't have any - // visibility on when this completes. - // NOTE: This value comes from empirical observations. - time.Sleep(120 * time.Second) - return nil -} - type apmInfoResp struct { Version string `json:"version"` PublishReady bool `json:"publish_ready"`