Skip to content

Commit

Permalink
Merge pull request #200 from codecov/gio/label-compressed/report-with-label-index
Browse files Browse the repository at this point in the history

Use encoded labels in reports with them
  • Loading branch information
giovanni-guidini committed Dec 6, 2023
2 parents 58c31fa + ec947a4 commit 455f7e4
Show file tree
Hide file tree
Showing 27 changed files with 4,365 additions and 382 deletions.
38 changes: 29 additions & 9 deletions helpers/labels.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum
from typing import Set, Union

import sentry_sdk
from shared.reports.resources import Report
Expand All @@ -25,34 +26,53 @@


class SpecialLabelsEnum(Enum):
    """Labels with special meaning in coverage reports.

    Each member's value is a ``(label, index)`` tuple: the label string used
    in raw reports and the reserved slot for it in the encoded label index
    (index 0 is reserved for the all-labels placeholder).
    """

    CODECOV_ALL_LABELS_PLACEHOLDER = ("Th2dMtk4M_codecov", 0)

    def __init__(self, label, index):
        # Expose the tuple parts under readable attribute names so callers
        # can use `.corresponding_label` / `.corresponding_index` instead of
        # unpacking `.value`.
        self.corresponding_label = label
        self.corresponding_index = index


@sentry_sdk.trace
def get_labels_per_session(report: Report, sess_id: int) -> Union[Set[str], Set[int]]:
    """Return the labels present in session ``sess_id`` of ``report``,
    EXCLUDING the SpecialLabel.

    The return value is either a set of strings (the labels themselves) OR
    a set of ints (the label indexes). A label can be looked up from its
    index using ``Report.lookup_label_by_id(label_id)`` (assuming
    ``Report._labels_idx`` is set).
    """
    all_labels = set()
    for rf in report:
        for _, line in rf.lines:
            if line.datapoints:
                for datapoint in line.datapoints:
                    if datapoint.sessionid == sess_id:
                        all_labels.update(datapoint.label_ids or [])
    # Remove the placeholder in both of its representations, since the
    # report may use either raw labels (str) or encoded indexes (int).
    return all_labels - {
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_index,
    }


@sentry_sdk.trace
def get_all_report_labels(report: Report) -> Union[Set[str], Set[int]]:
    """Return all labels present in ``report``, EXCLUDING the SpecialLabel.

    The return value is either a set of strings (the labels themselves) OR
    a set of ints (the label indexes). A label can be looked up from its
    index using ``Report.lookup_label_by_id(label_id)`` (assuming
    ``Report._labels_idx`` is set).
    """
    all_labels = set()
    for rf in report:
        for _, line in rf.lines:
            if line.datapoints:
                for datapoint in line.datapoints:
                    all_labels.update(datapoint.label_ids or [])
    # Remove the placeholder in both of its representations, since the
    # report may use either raw labels (str) or encoded indexes (int).
    return all_labels - {
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_index,
    }
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
https://github.com/codecov/shared/archive/9246e757c553be8b0e9eff28f038308914ec925c.tar.gz#egg=shared
https://github.com/codecov/shared/archive/954a9ba17722f73f86f3d3bb3367f80d887fc800.tar.gz#egg=shared
https://github.com/codecov/opentelem-python/archive/refs/tags/v0.0.4a1.tar.gz#egg=codecovopentelem
boto3
celery
Expand Down
12 changes: 11 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ anyio==3.6.1
# via
# httpcore
# openai
asgiref==3.7.2
# via django
async-timeout==4.0.2
# via redis
attrs==20.3.0
Expand Down Expand Up @@ -86,6 +88,8 @@ distlib==0.3.7
# via virtualenv
distro==1.8.0
# via openai
django==5.0
# via shared
ecdsa==0.16.1
# via tlslite-ng
exceptiongroup==1.1.3
Expand Down Expand Up @@ -161,6 +165,8 @@ idna==2.10
# requests
# rfc3986
# yarl
ijson==3.2.3
# via shared
importlib-metadata==6.8.0
# via opentelemetry-api
iniconfig==1.1.1
Expand Down Expand Up @@ -320,7 +326,7 @@ s3transfer==0.3.4
# via boto3
sentry-sdk==1.19.1
# via -r requirements.in
shared @ https://github.com/codecov/shared/archive/9246e757c553be8b0e9eff28f038308914ec925c.tar.gz
shared @ https://github.com/codecov/shared/archive/954a9ba17722f73f86f3d3bb3367f80d887fc800.tar.gz
# via -r requirements.in
six==1.15.0
# via
Expand All @@ -340,11 +346,14 @@ sqlalchemy==1.3.23
# via
# -r requirements.in
# pytest-sqlalchemy
# shared
# sqlalchemy-utils
sqlalchemy-utils==0.36.8
# via
# -r requirements.in
# pytest-sqlalchemy
sqlparse==0.4.4
# via django
statsd==3.3.0
# via
# -r requirements.in
Expand All @@ -365,6 +374,7 @@ typing==3.7.4.3
# via shared
typing-extensions==4.6.3
# via
# asgiref
# openai
# opentelemetry-sdk
# pydantic
Expand Down
5 changes: 4 additions & 1 deletion rollouts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,8 @@ def repo_slug(repo: Repository) -> str:
# Feature flag gating label-index usage during report processing.
# Rolled out to 0% globally; the listed repos are explicitly opted in.
USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG = Feature(
    "use_label_index_in_report_processing",
    0.0,
    overrides={
        "github/giovanni-guidini/sentry": "enabled",
        "github/giovanni-guidini/components-demo": "enabled",
    },
)
35 changes: 1 addition & 34 deletions services/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime
from enum import Enum
from hashlib import md5
from typing import Dict
from typing import Dict, Optional
from uuid import uuid4

from shared.config import get_config
Expand All @@ -20,9 +20,6 @@

class MinioEndpoints(Enum):
chunks = "{version}/repos/{repo_hash}/commits/{commitid}/{chunks_file_name}.txt"
label_index = (
"{version}/repos/{repo_hash}/commits/{commitid}/{label_index_file_name}.json"
)
json_data = "{version}/repos/{repo_hash}/commits/{commitid}/json_data/{table}/{field}/{external_id}.json"
json_data_no_commit = (
"{version}/repos/{repo_hash}/json_data/{table}/{field}/{external_id}.json"
Expand Down Expand Up @@ -230,20 +227,6 @@ def write_json_data_to_storage(
self.write_file(path, stringified_data)
return path

def write_label_index(self, commit_sha, json_data, report_code=None) -> str:
    """Serialize ``json_data`` as JSON and write it to archive storage as
    the commit's label index file.

    The file name is ``labels_index`` (prefixed with ``<report_code>_``
    when a report code is given), stored under the v4 commit path.
    Returns the storage path that was written.
    """
    label_index_file_name = (
        report_code + "_" if report_code is not None else ""
    ) + "labels_index"
    path = MinioEndpoints.label_index.get_path(
        version="v4",
        repo_hash=self.storage_hash,
        commitid=commit_sha,
        label_index_file_name=label_index_file_name,
    )
    string_data = json.dumps(json_data)
    self.write_file(path, string_data)
    return path

"""
Convenience method to write a chunks.txt file to storage.
"""
Expand Down Expand Up @@ -304,22 +287,6 @@ def read_chunks(self, commit_sha, report_code=None) -> str:

return self.read_file(path).decode(errors="replace")

def read_label_index(self, commit_sha, report_code=None) -> Dict[str, str]:
    """Read and deserialize the commit's label index file from storage.

    Mirrors ``write_label_index``: the file name is ``labels_index``
    (prefixed with ``<report_code>_`` when a report code is given), under
    the v4 commit path. Returns an empty dict when the file does not
    exist, so callers don't have to handle the missing-file case.
    """
    label_index_file_name = (
        report_code + "_" if report_code is not None else ""
    ) + "labels_index"
    path = MinioEndpoints.label_index.get_path(
        version="v4",
        repo_hash=self.storage_hash,
        commitid=commit_sha,
        label_index_file_name=label_index_file_name,
    )

    try:
        return json.loads(self.read_file(path).decode(errors="replace"))
    except FileNotInStorageError:
        return dict()

"""
Delete a chunk file from the archive
"""
Expand Down
14 changes: 13 additions & 1 deletion services/report/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import itertools
import logging
import sys
Expand Down Expand Up @@ -38,7 +39,7 @@
ReportExpiredException,
RepositoryWithoutValidBotError,
)
from helpers.labels import get_all_report_labels, get_labels_per_session
from helpers.labels import get_labels_per_session
from services.archive import ArchiveService
from services.report.parser import get_proper_parser
from services.report.parser.types import ParsedRawReport
Expand Down Expand Up @@ -719,6 +720,17 @@ async def create_new_report_for_commit(self, commit: Commit) -> Report:
paths_to_carryforward,
session_extras=dict(carriedforward_from=parent_commit.commitid),
)
# If the parent report has labels we also need to carryforward the label index
# Considerations:
# 1. It's necessary for label flags to be carried forward, so it's OK to carryforward the entire index
# 2. As tests are renamed, the index might start to fill with stale labels. This is not good,
#    but I'm unsure if we should try to clean it up at this point. Cleaning it up requires going through
#    all lines of the report. It will be handled by a dedicated task that is encoded by the UploadFinisher
# 3. We deepcopy the header so we can change them independently
# 4. The parent_commit always uses the default report to carryforward (i.e. report_code for parent_commit is None)
# parent_commit and commit should belong to the same repository
carryforward_report.header = copy.deepcopy(parent_report.header)

await self._possibly_shift_carryforward_report(
carryforward_report, parent_commit, commit
)
Expand Down
46 changes: 0 additions & 46 deletions services/report/labels_index.py

This file was deleted.

73 changes: 58 additions & 15 deletions services/report/languages/pycoverage.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import typing
from typing import Any, Dict, List, Optional, Union

from shared.reports.resources import Report

Expand All @@ -22,23 +22,54 @@ def matches_content(self, content, first_line, name) -> bool:
and "files" in content
)

def _normalize_label(self, testname) -> str:
    """Return the canonical label string for ``testname``.

    ``testname`` is either the raw label string or, for compressed
    reports, a numeric index into ``self.labels_table``. An empty label
    maps to the all-labels placeholder.
    """
    if isinstance(testname, (int, float)):
        # This is from a compressed report: pull the label from the
        # labels_table. Its keys are strings because of the JSON format.
        testname = self.labels_table[str(testname)]
    if testname == "":
        return SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label
    # Labels may carry extra context after "|"; keep only the name part.
    return testname.split("|", 1)[0]

def process(
self, name: str, content: typing.Any, report_builder: ReportBuilder
) -> Report:
def _get_list_of_label_ids(
    self,
    current_label_idx: Optional[Dict[int, str]],
    line_contexts: Optional[List[Union[str, int]]] = None,
) -> List[int]:
    """Return the sorted list of label ids for one line's contexts.

    When the report is already encoded, ``line_contexts`` holds indexes
    into the existing table and is passed through. Otherwise each context
    is normalized and assigned an id, registering new labels in
    ``current_label_idx`` (mutated in place) and ``self.reverse_table``.
    """
    # The annotated type is List, so treat a missing value as "no contexts"
    # instead of crashing on iteration over None.
    if line_contexts is None:
        line_contexts = []

    if self.are_labels_already_encoded:
        # The line contexts already include indexes in the table.
        # We can re-use the table and don't have to do anything with contexts.
        return sorted(map(int, line_contexts))

    # In this case we do need to fix the labels
    label_ids_for_line = set()
    for testname in line_contexts:
        clean_label = self._normalize_label(testname)
        if clean_label in self.reverse_table:
            label_ids_for_line.add(self.reverse_table[clean_label])
        else:
            # Allocate the next free id without materializing a copy of
            # the key set; `default=0` covers the empty-index case.
            label_id = max(current_label_idx.keys(), default=0) + 1
            current_label_idx[label_id] = clean_label
            self.reverse_table[clean_label] = label_id
            label_ids_for_line.add(label_id)

    return sorted(label_ids_for_line)

def process(self, name: str, content: Any, report_builder: ReportBuilder) -> Report:
report_builder_session = report_builder.create_report_builder_session(name)
labels_table = None
# Compressed pycoverage files will include a labels_table
# Mapping label_idx: int --> label: str
self.labels_table: Dict[int, str] = None
self.reverse_table = {}
self.are_labels_already_encoded = False
if "labels_table" in content:
labels_table = content["labels_table"]
self.labels_table = content["labels_table"]
# We can pre-populate some of the indexes that will be used
for idx, testname in self.labels_table.items():
clean_label = self._normalize_label(testname)
report_builder_session.label_index[int(idx)] = clean_label
self.are_labels_already_encoded = True
for filename, file_coverage in content["files"].items():
fixed_filename = report_builder.path_fixer(filename)
if fixed_filename:
Expand All @@ -47,12 +78,21 @@ def process(
(COVERAGE_HIT, ln) for ln in file_coverage["executed_lines"]
] + [(COVERAGE_MISS, ln) for ln in file_coverage["missing_lines"]]
for cov, ln in lines_and_coverage:
label_list_of_lists = [
[self._convert_testname_to_label(testname, labels_table)]
for testname in file_coverage.get("contexts", {}).get(
str(ln), []
)
]
if report_builder_session.should_use_label_index:
label_list_of_lists = [
[single_id]
for single_id in self._get_list_of_label_ids(
report_builder_session.label_index,
file_coverage.get("contexts", {}).get(str(ln), []),
)
]
else:
label_list_of_lists = [
[self._normalize_label(testname)]
for testname in file_coverage.get("contexts", {}).get(
str(ln), []
)
]
if ln > 0:
report_file.append(
ln,
Expand All @@ -64,4 +104,7 @@ def process(
),
)
report_builder_session.append(report_file)
# We don't need these anymore, so let them be removed by the garbage collector
self.reverse_table = None
self.labels_table = None
return report_builder_session.output_report()

Large diffs are not rendered by default.

Loading

0 comments on commit 455f7e4

Please sign in to comment.