From 22be527ab32728959e681aa3f024179b423b7577 Mon Sep 17 00:00:00 2001
From: Gguidini <giovanni@codecov.io>
Date: Mon, 18 Mar 2024 15:58:37 +0100
Subject: [PATCH] fix: don't eager merge sessions into report

context: https://l.codecov.dev/xvGZaM

Because we eagerly merge sessions when processing a report the following scenario can happen:
1. You upload an empty report with a flag set to carryforward
2. There will be a new session for that upload (even though it's empty)
3. In the next commit the session will be carried forward

If no real upload was made to clean up old carriedforward sessions from empty uploads
they would accumulate in the report (because `_adjust_sessions` only runs *after* the
processing is done, and *doesn't run* if it's an empty upload). What prompted this
investigation is that a customer was fund with thousands of carryforward session in
some of their commits.

So I'm exploiting the fact that merges into the report need to be made from within a lock
and getting the ID for the session _without_ actually adding it to the original report.

If the report is successful we add it and run `_adjust_sessions`. If the report was empty,
we just never add it to the report, as if it didn't existed.

OBS.: Even though I use the litle exploit I believe that passing the `sessionid` to
`ReportBuilder` is unecessary. We can likely not get a sessionid at all when processin the report
and it would still work. I decided to leave this for it's own change tho.

Also I decided to bump the severity of the messages that indicate a report is empty.
Users are not expected to upload empty reports, so it's at least sensible it should be
a warning.
---
 services/report/__init__.py                |  2 +-
 services/report/raw_upload_processor.py    | 10 ++-
 services/report/report_processor.py        |  2 +-
 services/report/tests/unit/test_process.py | 96 ++++++++++++++--------
 4 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/services/report/__init__.py b/services/report/__init__.py
index 8290da8e3..1093ef061 100644
--- a/services/report/__init__.py
+++ b/services/report/__init__.py
@@ -893,7 +893,7 @@ def build_report_from_raw_content(
                 upload_obj=upload,
             )
         except ReportEmptyError:
-            log.info(
+            log.warning(
                 "Report %s is empty",
                 reportid,
                 extra=dict(repoid=commit.repoid, commit=commit.commitid),
diff --git a/services/report/raw_upload_processor.py b/services/report/raw_upload_processor.py
index b93ca2797..795c4d20c 100644
--- a/services/report/raw_upload_processor.py
+++ b/services/report/raw_upload_processor.py
@@ -90,10 +90,14 @@ def process_raw_upload(
     else:
         ignored_file_lines = None
 
+    session = session or Session()
     # Get a sesisonid to merge into
     # anything merged into the original_report
     # will take on this sessionid
-    sessionid, session = original_report.add_session(session or Session())
+    # But we don't actually merge yet in case the report is empty.
+    # This is done to avoid garbage sessions to build up in the report
+    # How can you be sure this will be the sessionid used when you actually merge it? Remember that this piece of code runs inside a lock u.u
+    sessionid = original_report.next_session_number()
     session.id = sessionid
     if env:
         session.env = dict([e.split("=", 1) for e in env.split("\n") if "=" in e])
@@ -182,6 +186,10 @@ def process_raw_upload(
         # none of the reports processed contributed any new labels to it.
         # So we assume there are no labels and just reset the _labels_index of temporary_report
         temporary_report.labels_index = None
+    # Now we actually add the session to the original_report
+    # Because we know that the processing was successful
+    sessionid, session = original_report.add_session(session)
+    # Adjust sessions removed carryforward sessions that are being replaced
     session_manipulation_result = _adjust_sessions(
         original_report,
         temporary_report,
diff --git a/services/report/report_processor.py b/services/report/report_processor.py
index 5d9d835b7..ae727a71a 100644
--- a/services/report/report_processor.py
+++ b/services/report/report_processor.py
@@ -172,7 +172,7 @@ def process_report(
                         f"worker.services.report.processors.{processor.name}.failure"
                     )
                     raise
-    log.info(
+    log.warning(
         "File format could not be recognized",
         extra=dict(
             report_filename=name, first_line=first_line[:100], report_type=report_type
diff --git a/services/report/tests/unit/test_process.py b/services/report/tests/unit/test_process.py
index b2634c6bb..425fb05d6 100644
--- a/services/report/tests/unit/test_process.py
+++ b/services/report/tests/unit/test_process.py
@@ -197,8 +197,8 @@ def test_process_raw_upload_empty_report(self):
                 session=Session(flags=["fruits"]),
                 flags=[],
             )
-        assert len(original_report.sessions) == 2
-        assert sorted(original_report.flags.keys()) == ["fruits", "unit"]
+        assert len(original_report.sessions) == 1
+        assert sorted(original_report.flags.keys()) == ["unit"]
         assert original_report.files == ["file_1.go", "file_2.py"]
         assert original_report.flags["unit"].totals == ReportTotals(
             files=2,
@@ -215,21 +215,7 @@ def test_process_raw_upload_empty_report(self):
             complexity_total=0,
             diff=0,
         )
-        assert original_report.flags["fruits"].totals == ReportTotals(
-            files=0,
-            lines=0,
-            hits=0,
-            misses=0,
-            partials=0,
-            coverage=None,
-            branches=0,
-            methods=0,
-            messages=0,
-            sessions=1,
-            complexity=0,
-            complexity_total=0,
-            diff=0,
-        )
+        assert original_report.flags.get("fruits") is None
         general_totals, json_data = original_report.to_database()
         assert general_totals == {
             "f": 2,
@@ -241,7 +227,7 @@ def test_process_raw_upload_empty_report(self):
             "b": 1,
             "d": 0,
             "M": 0,
-            "s": 2,
+            "s": 1,
             "C": 10,
             "N": 2,
             "diff": None,
@@ -279,22 +265,7 @@ def test_process_raw_upload_empty_report(self):
                     "e": None,
                     "st": "uploaded",
                     "se": {},
-                },
-                "1": {
-                    "t": None,
-                    "d": None,
-                    "a": None,
-                    "f": ["fruits"],
-                    "c": None,
-                    "n": None,
-                    "N": None,
-                    "j": None,
-                    "u": None,
-                    "p": None,
-                    "e": None,
-                    "st": "uploaded",
-                    "se": {},
-                },
+                }
             },
         }
 
@@ -1015,9 +986,8 @@ def test_process_raw_upload_multiple_raw_reports(self, mocker):
 
 
 class TestProcessRawUploadCarryforwardFlags(BaseTestCase):
-    def test_process_raw_upload_with_carryforwarded_flags(self):
+    def _generate_sample_report(self) -> EditableReport:
         original_report = EditableReport()
-        upload_flags = ["somethingold", "flag_two"]
         first_file = EditableReportFile("banana.py")
         first_file.append(100, ReportLine.create(1, sessions=[LineSession(0, 1)]))
         first_file.append(200, ReportLine.create(0, sessions=[LineSession(0, 0)]))
@@ -1062,6 +1032,11 @@ def test_process_raw_upload_with_carryforwarded_flags(self):
             ],
             "third.c": [(2, 1, None, [[1, 1, None, None, None]], None, None)],
         }
+        return original_report
+
+    def test_process_raw_upload_with_carryforwarded_flags(self):
+        original_report = self._generate_sample_report()
+        upload_flags = ["somethingold", "flag_two"]
         raw_content = dumps(
             {
                 "coverage": {
@@ -1229,3 +1204,52 @@ def test_process_raw_upload_with_carryforwarded_flags(self):
                 (1, 1, None, [[session.id, 1, None, None, None]], None, None)
             ],
         }
+
+    def test_process_raw_upload_carryforward_flag_empty_upload(self):
+        """Tests that an upload with carryforward flag doesn't create a new session
+        in the original report for the flag it was uploaded with.
+
+        That is, a new session is created in the commit's report only if the upload
+        was properly processed
+        """
+        original_report = self._generate_sample_report()
+        original_sessions = original_report.sessions
+        expected_sessions = {
+            0: Session(flags=["super"]),
+            1: Session(flags=["somethingold"], session_type=SessionType.carriedforward),
+        }
+        assert isinstance(original_sessions, dict)
+        assert list(original_sessions.keys()) == [0, 1]
+        assert original_sessions == expected_sessions
+        empty_uploaded_report = LegacyParsedRawReport(
+            toc=None,
+            env=None,
+            report_fixes=None,
+            uploaded_files=[
+                ParsedUploadedReportFile(
+                    filename="/Users/path/to/app.coverage.json",
+                    file_contents=BytesIO(
+                        "Bogus report that shouldn't match any parser".encode()
+                    ),
+                ),
+            ],
+        )
+        session = Session(flags=["somethingold"])
+        with pytest.raises(ReportEmptyError):
+            process.process_raw_upload(
+                UserYaml(
+                    {
+                        "flag_management": {
+                            "individual_flags": [
+                                {"name": "somethingold", "carryforward": True}
+                            ]
+                        }
+                    }
+                ),
+                original_report,
+                empty_uploaded_report,
+                ["somethingold"],
+                session=session,
+            )
+        assert list(original_report.sessions.keys()) == [0, 1]
+        assert original_report.sessions == expected_sessions