diff --git a/.github/workflows/usage-metrics.yml b/.github/workflows/usage-metrics.yml index 55a09fc246..9ba518153e 100644 --- a/.github/workflows/usage-metrics.yml +++ b/.github/workflows/usage-metrics.yml @@ -64,7 +64,7 @@ jobs: # Query all versions in a single command go run collect-metrics.go $VERSION_FLAGS -csv "../docs/usage-metrics.csv" - - name: Create Pull Request + - name: Create or update Pull Request run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" @@ -75,16 +75,42 @@ jobs: exit 0 fi - # Create a new branch for the PR DATE=$(date +%Y-%m-%d) - BRANCH_NAME="chore/update-usage-metrics-$DATE" - git checkout -b "$BRANCH_NAME" + MONTH=$(date +%Y-%m) + BRANCH_NAME="chore/update-usage-metrics-$MONTH" + + # Check if the branch already exists on the remote + if git ls-remote --heads origin "$BRANCH_NAME" | grep -q "$BRANCH_NAME"; then + echo "Branch $BRANCH_NAME already exists, pushing new commit to it" + # Preserve the newly collected metrics before switching branches + cp docs/usage-metrics.csv /tmp/usage-metrics.csv + git checkout -- docs/usage-metrics.csv + git fetch origin "$BRANCH_NAME" + git checkout -B "$BRANCH_NAME" FETCH_HEAD + # Restore the newly collected metrics on top of the existing branch + cp /tmp/usage-metrics.csv docs/usage-metrics.csv + git add docs/usage-metrics.csv + # Check again after checkout — the diff might be empty if the + # branch already has identical data + if git diff --staged --quiet; then + echo "No changes to commit after checking out existing branch" + exit 0 + fi + else + echo "Creating new branch $BRANCH_NAME" + git checkout -b "$BRANCH_NAME" + fi git commit -m "chore(metrics): update usage metrics ($DATE)" git push -u origin "$BRANCH_NAME" - # Create PR using gh CLI - gh pr create \ - --title "chore: update usage metrics ($DATE)" \ - --body "Automated update of usage metrics data. This PR updates the usage metrics CSV file with the latest GitHub usage data for testcontainers-go versions." 
\ - --base main + # Create a PR only if one doesn't already exist for this branch + EXISTING_PR=$(gh pr list --head "$BRANCH_NAME" --state open --json number --jq '.[0].number') + if [ -n "$EXISTING_PR" ]; then + echo "PR #$EXISTING_PR already exists for branch $BRANCH_NAME, updated with new commit" + else + gh pr create \ + --title "chore: update usage metrics ($MONTH)" \ + --body "Automated update of usage metrics data. This PR updates the usage metrics CSV file with the latest GitHub usage data for testcontainers-go versions." \ + --base main + fi diff --git a/usage-metrics/collect-metrics.go b/usage-metrics/collect-metrics.go index d28ff09186..2bd9ef4cad 100644 --- a/usage-metrics/collect-metrics.go +++ b/usage-metrics/collect-metrics.go @@ -55,43 +55,80 @@ func main() { func collectMetrics(versions []string, csvPath string) error { date := time.Now().Format("2006-01-02") - metrics := make([]usageMetric, 0, len(versions)) + metrics := make(map[string]usageMetric) - // Query all versions sequentially + // Build a unique, non-empty list of versions to query + pending := make([]string, 0, len(versions)) + seen := make(map[string]struct{}, len(versions)) for _, version := range versions { version = strings.TrimSpace(version) if version == "" { continue } - - // Add delay BEFORE querying to avoid rate limiting - if len(metrics) > 0 { - log.Printf("Waiting 7 seconds before querying next version...") - time.Sleep(7 * time.Second) // 10 requests per 60 seconds = 6 seconds minimum + if _, ok := seen[version]; ok { + continue } + seen[version] = struct{}{} + pending = append(pending, version) + } + if len(pending) == 0 { + return errors.New("at least one non-empty version is required") + } - count, err := queryGitHubUsageWithRetry(version) - if err != nil { - log.Printf("Warning: Failed to query version %s after retries: %v", version, err) - continue + const ( + maxPasses = 5 + interRequestWait = 7 * time.Second // 10 requests per 60 seconds = 6 seconds minimum + 
passCooldown = 120 * time.Second // wait for rate limit window to fully reset between passes + ) + + for pass := 0; pass < maxPasses && len(pending) > 0; pass++ { + if pass > 0 { + log.Printf("Pass %d: waiting %v for rate limit window to reset before retrying %d failed version(s)...", + pass+1, passCooldown, len(pending)) + time.Sleep(passCooldown) + } else { + log.Printf("Pass 1: querying %d version(s)...", len(pending)) } - metric := usageMetric{ - Date: date, - Version: version, - Count: count, + var failed []string + queriesMade := 0 + for _, version := range pending { + // Add delay before querying to avoid rate limiting + if queriesMade > 0 { + log.Printf("Waiting %v before querying next version...", interRequestWait) + time.Sleep(interRequestWait) + } + + count, err := queryGitHubUsage(version) + queriesMade++ + if err != nil { + log.Printf("Pass %d: failed to query version %s: %v", pass+1, version, err) + if isRetryableError(err) { + failed = append(failed, version) + continue + } + return fmt.Errorf("query %s: %w", version, err) + } + + metrics[version] = usageMetric{ + Date: date, + Version: version, + Count: count, + } + fmt.Printf("Successfully queried: %s has %d usages on %s\n", version, count, date) } - metrics = append(metrics, metric) - fmt.Printf("Successfully queried: %s has %d usages on %s\n", version, count, metric.Date) + pending = failed + if len(pending) == 0 { + log.Printf("All versions queried successfully after %d pass(es).", pass+1) + } } - // Sort metrics by version - sort.Slice(metrics, func(i, j int) bool { - return metrics[i].Version < metrics[j].Version - }) + if len(pending) > 0 { + log.Printf("Warning: %d version(s) still failed after %d passes: %s", len(pending), maxPasses, strings.Join(pending, ", ")) + } - // Write all metrics to CSV + // Append new metrics to CSV for _, metric := range metrics { if err := appendToCSV(csvPath, metric); err != nil { log.Printf("Warning: Failed to write metric for %s: %v", metric.Version, err) @@ 
-100,49 +137,25 @@ func collectMetrics(versions []string, csvPath string) error { fmt.Printf("Successfully recorded: %s has %d usages on %s\n", metric.Version, metric.Count, metric.Date) } - return nil -} - -func queryGitHubUsageWithRetry(version string) (int, error) { - var lastErr error - // Backoff intervals: wait longer for rate limit to reset (rolling window) - backoffIntervals := []time.Duration{ - 60 * time.Second, // Wait for rolling window - 60 * time.Second, - 60 * time.Second, + // Sort the entire CSV so rows are ordered by (date, version) regardless + // of the order they were appended across multiple runs. + if err := sortCSV(csvPath); err != nil { + return fmt.Errorf("sort csv: %w", err) } - // maxRetries includes the initial attempt plus one retry per backoff interval - maxRetries := len(backoffIntervals) + 1 - - for attempt := 0; attempt < maxRetries; attempt++ { - if attempt > 0 { - // Use predefined backoff intervals - waitTime := backoffIntervals[attempt-1] - log.Printf("Retrying version %s in %v (attempt %d/%d)", version, waitTime, attempt+1, maxRetries) - time.Sleep(waitTime) - } - - count, err := queryGitHubUsage(version) - if err == nil { - return count, nil - } - - lastErr = err - - // Check if it's a rate limit error - if strings.Contains(err.Error(), "rate limit") || - strings.Contains(err.Error(), "403") || - strings.Contains(err.Error(), "429") { - log.Printf("Rate limit hit for version %s, will retry with backoff", version) - continue - } - - // For non-rate-limit errors, retry but with shorter backoff - log.Printf("Error querying version %s: %v", version, err) - } + return nil +} - return 0, fmt.Errorf("max retries reached: %w", lastErr) +// isRetryableError returns true for rate-limit and transient HTTP errors +// that are worth retrying in a subsequent pass. 
+func isRetryableError(err error) bool { + msg := err.Error() + return strings.Contains(msg, "rate limit") || + strings.Contains(msg, "403") || + strings.Contains(msg, "429") || + strings.Contains(msg, "500") || + strings.Contains(msg, "502") || + strings.Contains(msg, "503") } func queryGitHubUsage(version string) (int, error) { @@ -173,6 +186,55 @@ func queryGitHubUsage(version string) (int, error) { return resp.TotalCount, nil } +func sortCSV(csvPath string) error { + absPath, err := filepath.Abs(csvPath) + if err != nil { + return fmt.Errorf("resolve path: %w", err) + } + + file, err := os.Open(absPath) + if err != nil { + return fmt.Errorf("open file: %w", err) + } + + reader := csv.NewReader(file) + records, err := reader.ReadAll() + file.Close() + if err != nil { + return fmt.Errorf("read csv: %w", err) + } + + if len(records) <= 1 { + return nil // nothing to sort (header only or empty) + } + + header := records[0] + data := records[1:] + + sort.SliceStable(data, func(i, j int) bool { + if data[i][0] != data[j][0] { + return data[i][0] < data[j][0] // date ascending + } + return data[i][1] < data[j][1] // version ascending + }) + + out, err := os.Create(absPath) + if err != nil { + return fmt.Errorf("create file: %w", err) + } + defer out.Close() + + writer := csv.NewWriter(out) + if err := writer.Write(header); err != nil { + return fmt.Errorf("write header: %w", err) + } + if err := writer.WriteAll(data); err != nil { + return fmt.Errorf("write records: %w", err) + } + + return nil +} + func appendToCSV(csvPath string, metric usageMetric) error { absPath, err := filepath.Abs(csvPath) if err != nil {