Skip to content

Commit

Permalink
Merge pull request #200 from codecov/gio/label-compressed/report-with-label-index
Browse files Browse the repository at this point in the history

Use encoded labels in reports with them
  • Loading branch information
giovanni-guidini committed Dec 6, 2023
2 parents 58c31fa + ec947a4 commit 455f7e4
Show file tree
Hide file tree
Showing 27 changed files with 4,365 additions and 382 deletions.
38 changes: 29 additions & 9 deletions helpers/labels.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum
from typing import Set, Union

import sentry_sdk
from shared.reports.resources import Report
Expand All @@ -25,34 +26,53 @@


class SpecialLabelsEnum(Enum):
    """Labels with special meaning in coverage reports.

    Each member's value is a ``(label, index)`` tuple: the label string used
    in raw reports and the reserved slot for it in the encoded label index
    (index 0 is reserved for the all-labels placeholder).
    """

    CODECOV_ALL_LABELS_PLACEHOLDER = ("Th2dMtk4M_codecov", 0)

    def __init__(self, label, index):
        # Expose the tuple parts under readable attribute names so callers
        # can use `.corresponding_label` / `.corresponding_index` instead of
        # unpacking `.value`.
        self.corresponding_label = label
        self.corresponding_index = index


@sentry_sdk.trace
def get_labels_per_session(report: Report, sess_id: int) -> Union[Set[str], Set[int]]:
    """Return the labels present in session ``sess_id`` of ``report``,
    EXCLUDING the SpecialLabel.

    The return value is either a set of strings (the labels themselves) OR
    a set of ints (the label indexes). A label can be looked up from its
    index using ``Report.lookup_label_by_id(label_id)`` (assuming
    ``Report._labels_idx`` is set).
    """
    all_labels = set()
    for rf in report:
        for _, line in rf.lines:
            if line.datapoints:
                for datapoint in line.datapoints:
                    if datapoint.sessionid == sess_id:
                        all_labels.update(datapoint.label_ids or [])
    # Remove the placeholder in both of its representations, since the
    # report may use either raw labels (str) or encoded indexes (int).
    return all_labels - {
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_index,
    }


@sentry_sdk.trace
def get_all_report_labels(report: Report) -> Union[Set[str], Set[int]]:
    """Return all labels present in ``report``, EXCLUDING the SpecialLabel.

    The return value is either a set of strings (the labels themselves) OR
    a set of ints (the label indexes). A label can be looked up from its
    index using ``Report.lookup_label_by_id(label_id)`` (assuming
    ``Report._labels_idx`` is set).
    """
    all_labels = set()
    for rf in report:
        for _, line in rf.lines:
            if line.datapoints:
                for datapoint in line.datapoints:
                    all_labels.update(datapoint.label_ids or [])
    # Remove the placeholder in both of its representations, since the
    # report may use either raw labels (str) or encoded indexes (int).
    return all_labels - {
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
        SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_index,
    }
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
https://github.com/codecov/shared/archive/9246e757c553be8b0e9eff28f038308914ec925c.tar.gz#egg=shared
https://github.com/codecov/shared/archive/954a9ba17722f73f86f3d3bb3367f80d887fc800.tar.gz#egg=shared
https://github.com/codecov/opentelem-python/archive/refs/tags/v0.0.4a1.tar.gz#egg=codecovopentelem
boto3
celery
Expand Down
12 changes: 11 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ anyio==3.6.1
# via
# httpcore
# openai
asgiref==3.7.2
# via django
async-timeout==4.0.2
# via redis
attrs==20.3.0
Expand Down Expand Up @@ -86,6 +88,8 @@ distlib==0.3.7
# via virtualenv
distro==1.8.0
# via openai
django==5.0
# via shared
ecdsa==0.16.1
# via tlslite-ng
exceptiongroup==1.1.3
Expand Down Expand Up @@ -161,6 +165,8 @@ idna==2.10
# requests
# rfc3986
# yarl
ijson==3.2.3
# via shared
importlib-metadata==6.8.0
# via opentelemetry-api
iniconfig==1.1.1
Expand Down Expand Up @@ -320,7 +326,7 @@ s3transfer==0.3.4
# via boto3
sentry-sdk==1.19.1
# via -r requirements.in
shared @ https://github.com/codecov/shared/archive/9246e757c553be8b0e9eff28f038308914ec925c.tar.gz
shared @ https://github.com/codecov/shared/archive/954a9ba17722f73f86f3d3bb3367f80d887fc800.tar.gz
# via -r requirements.in
six==1.15.0
# via
Expand All @@ -340,11 +346,14 @@ sqlalchemy==1.3.23
# via
# -r requirements.in
# pytest-sqlalchemy
# shared
# sqlalchemy-utils
sqlalchemy-utils==0.36.8
# via
# -r requirements.in
# pytest-sqlalchemy
sqlparse==0.4.4
# via django
statsd==3.3.0
# via
# -r requirements.in
Expand All @@ -365,6 +374,7 @@ typing==3.7.4.3
# via shared
typing-extensions==4.6.3
# via
# asgiref
# openai
# opentelemetry-sdk
# pydantic
Expand Down
5 changes: 4 additions & 1 deletion rollouts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,8 @@ def repo_slug(repo: Repository) -> str:
# Feature flag gating label-index usage during report processing.
# Rolled out to 0% globally; the listed repos are explicitly opted in.
USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG = Feature(
    "use_label_index_in_report_processing",
    0.0,
    overrides={
        "github/giovanni-guidini/sentry": "enabled",
        "github/giovanni-guidini/components-demo": "enabled",
    },
)
35 changes: 1 addition & 34 deletions services/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime
from enum import Enum
from hashlib import md5
from typing import Dict
from typing import Dict, Optional
from uuid import uuid4

from shared.config import get_config
Expand All @@ -20,9 +20,6 @@

class MinioEndpoints(Enum):
chunks = "{version}/repos/{repo_hash}/commits/{commitid}/{chunks_file_name}.txt"
label_index = (
"{version}/repos/{repo_hash}/commits/{commitid}/{label_index_file_name}.json"
)
json_data = "{version}/repos/{repo_hash}/commits/{commitid}/json_data/{table}/{field}/{external_id}.json"
json_data_no_commit = (
"{version}/repos/{repo_hash}/json_data/{table}/{field}/{external_id}.json"
Expand Down Expand Up @@ -230,20 +227,6 @@ def write_json_data_to_storage(
self.write_file(path, stringified_data)
return path

def write_label_index(self, commit_sha, json_data, report_code=None) -> str:
    """Serialize ``json_data`` as JSON and write it to archive storage as
    the commit's label index file.

    The file name is ``labels_index`` (prefixed with ``<report_code>_``
    when a report code is given), stored under the v4 commit path.
    Returns the storage path that was written.
    """
    label_index_file_name = (
        report_code + "_" if report_code is not None else ""
    ) + "labels_index"
    path = MinioEndpoints.label_index.get_path(
        version="v4",
        repo_hash=self.storage_hash,
        commitid=commit_sha,
        label_index_file_name=label_index_file_name,
    )
    string_data = json.dumps(json_data)
    self.write_file(path, string_data)
    return path

"""
Convenience method to write a chunks.txt file to storage.
"""
Expand Down Expand Up @@ -304,22 +287,6 @@ def read_chunks(self, commit_sha, report_code=None) -> str:

return self.read_file(path).decode(errors="replace")

def read_label_index(self, commit_sha, report_code=None) -> Dict[str, str]:
    """Read and deserialize the commit's label index file from storage.

    Mirrors ``write_label_index``: the file name is ``labels_index``
    (prefixed with ``<report_code>_`` when a report code is given), under
    the v4 commit path. Returns an empty dict when the file does not
    exist, so callers don't have to handle the missing-file case.
    """
    label_index_file_name = (
        report_code + "_" if report_code is not None else ""
    ) + "labels_index"
    path = MinioEndpoints.label_index.get_path(
        version="v4",
        repo_hash=self.storage_hash,
        commitid=commit_sha,
        label_index_file_name=label_index_file_name,
    )

    try:
        return json.loads(self.read_file(path).decode(errors="replace"))
    except FileNotInStorageError:
        return dict()

"""
Delete a chunk file from the archive
"""
Expand Down
14 changes: 13 additions & 1 deletion services/report/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import itertools
import logging
import sys
Expand Down Expand Up @@ -38,7 +39,7 @@
ReportExpiredException,
RepositoryWithoutValidBotError,
)
from helpers.labels import get_all_report_labels, get_labels_per_session
from helpers.labels import get_labels_per_session
from services.archive import ArchiveService
from services.report.parser import get_proper_parser
from services.report.parser.types import ParsedRawReport
Expand Down Expand Up @@ -719,6 +720,17 @@ async def create_new_report_for_commit(self, commit: Commit) -> Report:
paths_to_carryforward,
session_extras=dict(carriedforward_from=parent_commit.commitid),
)
# If the parent report has labels we also need to carryforward the label index
# Considerations:
# 1. It's necessary for label flags to be carried forward, so it's OK to carryforward the entire index
# 2. As tests are renamed, the index might start to fill with stale labels. This is not good,
#    but I'm unsure if we should try to clean it up at this point. Cleaning it up requires going through
#    all lines of the report. It will be handled by a dedicated task that is encoded by the UploadFinisher
# 3. We deepcopy the header so we can change them independently
# 4. The parent_commit always uses the default report to carryforward (i.e. report_code for parent_commit is None)
# parent_commit and commit should belong to the same repository
carryforward_report.header = copy.deepcopy(parent_report.header)

await self._possibly_shift_carryforward_report(
carryforward_report, parent_commit, commit
)
Expand Down
46 changes: 0 additions & 46 deletions services/report/labels_index.py

This file was deleted.

73 changes: 58 additions & 15 deletions services/report/languages/pycoverage.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import typing
from typing import Any, Dict, List, Optional, Union

from shared.reports.resources import Report

Expand All @@ -22,23 +22,54 @@ def matches_content(self, content, first_line, name) -> bool:
and "files" in content
)

def _normalize_label(self, testname) -> str:
    """Return the canonical label string for ``testname``.

    ``testname`` is either the raw label string or, for compressed
    reports, a numeric index into ``self.labels_table``. An empty label
    maps to the all-labels placeholder.
    """
    if isinstance(testname, (int, float)):
        # This is from a compressed report: pull the label from the
        # labels_table. Its keys are strings because of the JSON format.
        testname = self.labels_table[str(testname)]
    if testname == "":
        return SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label
    # Labels may carry extra context after "|"; keep only the name part.
    return testname.split("|", 1)[0]

def process(
self, name: str, content: typing.Any, report_builder: ReportBuilder
) -> Report:
def _get_list_of_label_ids(
    self,
    current_label_idx: Optional[Dict[int, str]],
    line_contexts: Optional[List[Union[str, int]]] = None,
) -> List[int]:
    """Return the sorted list of label ids for one line's contexts.

    When the report is already encoded, ``line_contexts`` holds indexes
    into the existing table and is passed through. Otherwise each context
    is normalized and assigned an id, registering new labels in
    ``current_label_idx`` (mutated in place) and ``self.reverse_table``.
    """
    # The annotated type is List, so treat a missing value as "no contexts"
    # instead of crashing on iteration over None.
    if line_contexts is None:
        line_contexts = []

    if self.are_labels_already_encoded:
        # The line contexts already include indexes in the table.
        # We can re-use the table and don't have to do anything with contexts.
        return sorted(map(int, line_contexts))

    # In this case we do need to fix the labels
    label_ids_for_line = set()
    for testname in line_contexts:
        clean_label = self._normalize_label(testname)
        if clean_label in self.reverse_table:
            label_ids_for_line.add(self.reverse_table[clean_label])
        else:
            # Allocate the next free id without materializing a copy of
            # the key set; `default=0` covers the empty-index case.
            label_id = max(current_label_idx.keys(), default=0) + 1
            current_label_idx[label_id] = clean_label
            self.reverse_table[clean_label] = label_id
            label_ids_for_line.add(label_id)

    return sorted(label_ids_for_line)

def process(self, name: str, content: Any, report_builder: ReportBuilder) -> Report:
report_builder_session = report_builder.create_report_builder_session(name)
labels_table = None
# Compressed pycoverage files will include a labels_table
# Mapping label_idx: int --> label: str
self.labels_table: Dict[int, str] = None
self.reverse_table = {}
self.are_labels_already_encoded = False
if "labels_table" in content:
labels_table = content["labels_table"]
self.labels_table = content["labels_table"]
# We can pre-populate some of the indexes that will be used
for idx, testname in self.labels_table.items():
clean_label = self._normalize_label(testname)
report_builder_session.label_index[int(idx)] = clean_label
self.are_labels_already_encoded = True
for filename, file_coverage in content["files"].items():
fixed_filename = report_builder.path_fixer(filename)
if fixed_filename:
Expand All @@ -47,12 +78,21 @@ def process(
(COVERAGE_HIT, ln) for ln in file_coverage["executed_lines"]
] + [(COVERAGE_MISS, ln) for ln in file_coverage["missing_lines"]]
for cov, ln in lines_and_coverage:
label_list_of_lists = [
[self._convert_testname_to_label(testname, labels_table)]
for testname in file_coverage.get("contexts", {}).get(
str(ln), []
)
]
if report_builder_session.should_use_label_index:
label_list_of_lists = [
[single_id]
for single_id in self._get_list_of_label_ids(
report_builder_session.label_index,
file_coverage.get("contexts", {}).get(str(ln), []),
)
]
else:
label_list_of_lists = [
[self._normalize_label(testname)]
for testname in file_coverage.get("contexts", {}).get(
str(ln), []
)
]
if ln > 0:
report_file.append(
ln,
Expand All @@ -64,4 +104,7 @@ def process(
),
)
report_builder_session.append(report_file)
# We don't need these anymore, so let them be removed by the garbage collector
self.reverse_table = None
self.labels_table = None
return report_builder_session.output_report()

Large diffs are not rendered by default.

Loading

0 comments on commit 455f7e4

Please sign in to comment.