Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions ci-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,31 @@ ci-operator \
--namespace mynamespace
```

### Rebalancing tests among platforms

If test volume for a given platform exceeds [the Boskos lease capacity][boskos-leases], [`jobs-failing-with-lease-acquire-timeout`](../clusters/app.ci/prow-monitoring/mixins/prometheus_out/prometheus-prow-rules_prometheusrule.yaml) will fire.
Presubmit jobs may be rebalanced to move platform-agnostic jobs to platforms with available capacity.
Component teams may mark a presubmit job as platform-agnostic by choosing an `as` name that excludes the platform slug (e.g. `aws`); the absence of the slug is what marks the test as platform-agnostic.
For example, see [release#10152][release-10152].
To locate platform-specific jobs which might be good candidates for moving to the platform-agnostic pool, you can use:

```console
$ hack/step-jobs-by-platform.py
workflows which need alternative platforms to support balancing:
baremetalds-e2e
ipi-aws
ipi-aws-ovn-hybrid
openshift-e2e-aws-csi
...
count platform status alternatives job
39 gcp balanceable aws,azure,vsphere pull-ci-openshift-cluster-version-operator-master-e2e
26 aws unknown azure,gcp,vsphere pull-ci-openshift-sriov-dp-admission-controller-master-e2e-aws
15 aws unknown azure,gcp,vsphere pull-ci-openshift-cluster-authentication-operator-master-e2e-aws
10 aws balanceable azure,vsphere pull-ci-openshift-machine-config-operator-master-e2e-ovn-step-registry
9 aws unknown gcp pull-ci-openshift-cluster-samples-operator-release-4.1-e2e-aws-image-ecosystem
...
```

### Rebalancing AWS tests among regions and zones

Occasionally we hit install errors like:
Expand Down Expand Up @@ -400,3 +425,4 @@ Focusing on [step-registry](step-registry) consumers, you could avoid us-east-1b

[aws-creating-shared-subnets]: https://github.com/openshift/release/pull/6949/commits/1b21187950b7d1d83f87774e9c52e74616e1b6c4
[boskos-leases]: https://steps.ci.openshift.org/help/leases
[release-10152]: https://github.com/openshift/release/pull/10152
141 changes: 141 additions & 0 deletions hack/step-jobs-by-platform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python3

import codecs
import json
import os
from urllib.request import urlopen

import yaml


def load_config(directory):
    """Load ci-operator test configuration from YAML files under *directory*.

    Returns a mapping from 'org/repo' to a mapping from generated Prow
    presubmit job name to that test's 'steps' stanza.  Each stanza gains a
    'platform' key derived from its cluster profile.  Tests without a
    step-registry cluster_profile are skipped, as are YAML files lacking
    'zz_generated_metadata'.

    Raises whatever the YAML parser raises on a malformed file, after
    printing the offending path.
    """
    _repo_config = {}
    for _basedir, _, _filenames in os.walk(directory):
        for _filename in _filenames:
            if not _filename.endswith('.yaml'):
                continue
            _path = os.path.join(_basedir, _filename)
            try:
                with open(_path, 'r') as f:
                    # safe_load avoids arbitrary-object construction and the
                    # deprecated Loader-less yaml.load() call.
                    _config = yaml.safe_load(f)
            except Exception:
                # Name the bad file before re-raising; a bare traceback from a
                # tree walk doesn't say which of hundreds of files broke.
                print('failed to load YAML from {}'.format(_path))
                raise
            # An empty YAML file parses to None; treat it like any other
            # non-ci-operator file.
            if not _config or 'zz_generated_metadata' not in _config:
                continue
            _org_repo = '{org}/{repo}'.format(**_config['zz_generated_metadata'])
            if _org_repo not in _repo_config:
                _repo_config[_org_repo] = {}
            for _test in _config.get('tests', []):
                if 'cluster_profile' not in _test.get('steps', {}):
                    continue
                _job_name = 'pull-ci-{org}-{repo}-{branch}-{test_as}'.format(test_as=_test['as'], **_config['zz_generated_metadata'])
                _test['steps']['platform'] = cluster_profile_platform(cluster_profile=_test['steps']['cluster_profile'])
                _repo_config[_org_repo][_job_name] = _test['steps']
    return _repo_config


def platform_stripped_workflows(repo_config):
    """Group workflows into platform-agnostic families.

    Returns a mapping from the platform-stripped workflow name (platform slug
    replaced by 'PLATFORM') to a mapping of platform -> concrete workflow
    name.  Workflows whose name does not embed their platform cannot be
    stripped; those are reported to stdout with up to three sample jobs per
    platform and omitted from the result.
    """
    stripped = {}
    unstrippable = {}
    for jobs in repo_config.values():
        for job_name, steps in jobs.items():
            generic = platform_stripped_workflow(workflow=steps['workflow'], platform=steps['platform'])
            if generic:
                stripped.setdefault(generic, {})[steps['platform']] = steps['workflow']
            else:
                # Remember a few example jobs so the report is actionable.
                per_platform = unstrippable.setdefault(steps['workflow'], {})
                per_platform.setdefault(steps['platform'], set()).add(job_name)
    if unstrippable:
        print('unable to determine platform-agnostic workflows for:')
        for workflow, platforms in sorted(unstrippable.items()):
            print(' {}'.format(workflow))
            for platform, platform_jobs in sorted(platforms.items()):
                sample = sorted(platform_jobs)[:3]
                suffix = ', ...' if len(platform_jobs) > 3 else ''
                print(' {} ({}{})'.format(platform, ', '.join(sample), suffix))
    return stripped


def yield_interesting_jobs(repo_config, balanceable_workflows):
    """Yield job names whose workflow belongs to a balanceable family.

    A job is interesting when stripping its platform slug from its workflow
    name yields an entry in *balanceable_workflows*.
    """
    for jobs in repo_config.values():
        for job_name, steps in jobs.items():
            family = platform_stripped_workflow(workflow=steps['workflow'], platform=steps['platform'])
            if family in balanceable_workflows:
                yield job_name


def cluster_profile_platform(cluster_profile):
    """Translate from steps.cluster_profile to workflow.as slugs."""
    # Most cluster profiles already match the workflow slug; only a
    # couple of legacy names need remapping.
    aliases = {
        'azure4': 'azure',
        'packet': 'metal',
    }
    return aliases.get(cluster_profile, cluster_profile)


def platform_stripped_workflow(workflow, platform):
_key = workflow.replace(platform, 'PLATFORM')
if 'PLATFORM' in _key:
return _key
return None


def get_prow_job_counts(uri, interesting_jobs):
    """Fetch the prowjobs JSON from *uri* and tally runs per interesting job.

    Returns a mapping from job name to the number of ProwJob entries whose
    spec.job matched a name in *interesting_jobs*; jobs with no entries are
    absent from the result.
    """
    with urlopen(uri) as response:
        # The endpoint serves bytes; decode as UTF-8 before parsing.
        payload = json.load(codecs.getreader('utf-8')(response))
    counts = {}
    for item in payload.get('items', []):
        name = item['spec']['job']
        if name in interesting_jobs:
            counts[name] = counts.get(name, 0) + 1
    return counts


def print_counts(counts, job_steps, job_org_repos, stripped_workflows, platform_specific_repositories):
    """Print a tab-separated report of job run counts, busiest job first.

    Jobs whose name omits their platform are 'balanceable'; jobs from
    repositories in *platform_specific_repositories* are skipped; everything
    else is 'unknown'.  The alternatives column lists the other platforms
    already running this workflow family.
    """
    print('{}\t{}\t{}\t{}\t{}'.format('count', 'platform', 'status', 'alternatives', 'job'))
    busiest_first = sorted(counts.items(), key=lambda item: -item[1])
    for job_name, run_count in busiest_first:
        steps = job_steps[job_name]
        platform = steps['platform']
        family = platform_stripped_workflow(workflow=steps['workflow'], platform=platform)
        alternatives = sorted(p for p in stripped_workflows[family].keys() if p != platform)
        if platform not in job_name:
            status = 'balanceable'
        elif job_org_repos[job_name] in platform_specific_repositories:
            # These repos legitimately pin a platform; no need to report them.
            continue
        else:
            status = 'unknown'
        print('{}\t{}\t{}\t{}\t{}'.format(run_count, platform, status, ','.join(alternatives), job_name))


if __name__ == '__main__':
    # Index every ci-operator config under the openshift org.
    _repo_config = load_config(directory=os.path.join('ci-operator', 'config', 'openshift'))
    # Flatten per-repo data into job -> steps and job -> org/repo lookups.
    # (The original also accumulated a `platforms` set here; it was never
    # read, so it has been removed.)
    _job_steps = {}
    _job_org_repos = {}
    for _org_repo, _jobs in _repo_config.items():
        for _job, _steps in _jobs.items():
            _job_steps[_job] = _steps
            _job_org_repos[_job] = _org_repo
    _stripped_workflows = platform_stripped_workflows(repo_config=_repo_config)
    # A workflow family is balanceable when it already runs on more than one
    # platform; families pinned to a single platform need new variants first.
    _balanceable_workflows = {_workflow for _workflow, _platforms in _stripped_workflows.items() if len(_platforms) > 1}
    _fixed_workflows = set(_stripped_workflows.keys()) - _balanceable_workflows
    if _fixed_workflows:
        print('workflows which need alternative platforms to support balancing:')
        for _workflow in sorted(_fixed_workflows):
            # Each fixed family has exactly one concrete workflow; print it.
            print(' {}'.format(next(iter(_stripped_workflows[_workflow].values()))))
    _interesting_jobs = set(yield_interesting_jobs(repo_config=_repo_config, balanceable_workflows=_balanceable_workflows))
    _counts = get_prow_job_counts(uri='https://prow.svc.ci.openshift.org/prowjobs.js', interesting_jobs=_interesting_jobs)
    # Repositories whose jobs legitimately pin a platform; print_counts
    # excludes them from the 'unknown' bucket.
    _platform_specific_repositories = {
        'openshift/cloud-credential-operator',
        'openshift/installer',
        'openshift/machine-config-operator',
    }
    print_counts(counts=_counts, job_steps=_job_steps, job_org_repos=_job_org_repos, stripped_workflows=_stripped_workflows, platform_specific_repositories=_platform_specific_repositories)