
Commit

Merge branch 'main' into tlm-update-max-tokens
ulya-tkch committed Sep 26, 2024
2 parents d9b66a8 + 90fb5cf commit 95de12f
Showing 17 changed files with 1,048 additions and 350 deletions.
114 changes: 114 additions & 0 deletions .github/workflows/ci-rerun-failed-test-tlm.yml
@@ -0,0 +1,114 @@
name: /rerun-failed-test-tlm CI

on:
issue_comment:
types: [created]

jobs:
propertytestrerun:
name: "TLM Property Test: Python ${{ matrix.python }} on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
if: github.event.comment.body == '/rerun-failed-test-tlm' # Only run if the comment is "/rerun-failed-test-tlm"
strategy:
matrix:
os:
- macos-latest
python:
- "3.11"
steps:
- name: Checkout latest commit to pull request
uses: actions/checkout@v3
with:
ref: refs/pull/${{ github.event.issue.number }}/head
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
- name: Install dependencies
run: |
pip install --upgrade pip
pip install .
pip install -r tests/requirements_test.txt
- name: Restore pytest cache
uses: actions/cache@v2
with:
path: .pytest_cache
key: pytest-cache-${{ github.run_id }}
restore-keys: pytest-cache-
- name: Install Cleanlab Studio client
run: pip install -e .
- name: Set env variables (Linux/macOS/Windows)
run: |
echo "CLEANLAB_API_BASE_URL=${{ secrets.CLEANLAB_API_BASE_URL }}" >> $GITHUB_ENV
shell: bash
- name: Cleanlab login
run: cleanlab login --key "${{ secrets.CLEANLAB_STUDIO_CI_API_KEY }}"
- name: Run tests
run: |
pytest tests/tlm/test_properties.py --last-failed --verbose
- name: Cache pytest results
uses: actions/cache@v2
with:
path: .pytest_cache
key: pytest-cache-${{ github.run_id }}
launch-tlm-test-notif:
name: Comment TLM test start on PR
runs-on: ubuntu-latest
if: github.event.comment.body == '/rerun-failed-test-tlm' # Only run if the comment is "/rerun-failed-test-tlm"
steps:
- name: Log PR Number and Pull request ref
run: echo "PR Number is ${{ github.event.issue.number }}. PR ref is refs/pull/${{ github.event.issue.number }}/head."
- name: Find Comment
id: fc
uses: peter-evans/find-comment@v3
with:
issue-number: ${{ github.event.issue.number }}
body-regex: '^/rerun-failed-test-tlm$'
direction: last
- name: Update comment
uses: peter-evans/create-or-update-comment@v4
if: steps.fc.outputs.comment-id != ''
with:
comment-id: ${{ steps.fc.outputs.comment-id }}
body: |
:sparkles: **Starting Rerun of Failed TLM tests...** :sparkles:
If you want to run all the TLM tests (because TLM code is ready for review), comment '/test-tlm' on this PR.
If you want to re-run only the failed tests again (you are still developing), comment '/rerun-failed-test-tlm' on this PR.
[View full GitHub Actions run log](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
reactions: rocket
finish-tlm-test-notif:
name: Comment TLM test completion on PR
runs-on: ubuntu-latest
needs:
- launch-tlm-test-notif
- propertytestrerun
if: always() && github.event.comment.body == '/rerun-failed-test-tlm' # Only run if the comment is "/rerun-failed-test-tlm"
continue-on-error: true
steps:
- name: Find Comment
id: fc
uses: peter-evans/find-comment@v3
with:
issue-number: ${{ github.event.issue.number }}
body-includes: Starting Rerun of Failed TLM tests
direction: last
- name: Build Comment Body
run: |
# Set up emojis based on test results
if [[ "${{ needs.propertytestrerun.result }}" == "success" ]]; then
PROPERTY_TEST_RESULT="✅"
else
PROPERTY_TEST_RESULT="❌"
fi
echo "PROPERTY_TEST_RESULT=$PROPERTY_TEST_RESULT" >> $GITHUB_ENV
- name: Update comment
if: steps.fc.outputs.comment-id != ''
uses: peter-evans/create-or-update-comment@v4
with:
comment-id: ${{ steps.fc.outputs.comment-id }}
body: |
:sparkles: **Failed Property Test rerun completed!** :sparkles:
TLM Property Previously Failed Tests Results: ${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}
**Note:** These results are only for the tests that failed on the previous run and not for all tests.
Click the Github Actions run log for more information.
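For context on the caching steps above: `pytest --last-failed` works off pytest's own cache, which the workflow persists between runs via `actions/cache`. The sketch below (not part of this commit) shows where that state lives; the script itself is illustrative, assuming the default cacheprovider layout.

```python
# Hedged sketch, not part of this commit: inspect the cache that the workflow
# above restores and re-saves. pytest's cacheprovider plugin records previously
# failed test IDs in .pytest_cache/v/cache/lastfailed as a JSON object, and
# `pytest --last-failed` reruns only those IDs.
import json
from pathlib import Path

cache_file = Path(".pytest_cache/v/cache/lastfailed")
if cache_file.exists():
    failed = json.loads(cache_file.read_text())
    print(f"{len(failed)} previously failed test(s) would be rerun by --last-failed:")
    for node_id in failed:
        print(" -", node_id)
else:
    print("No lastfailed cache found; --last-failed falls back to running all tests.")
```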
18 changes: 15 additions & 3 deletions .github/workflows/ci-test-tlm.yml
@@ -70,6 +70,12 @@ jobs:
pip install --upgrade pip
pip install .
pip install -r tests/requirements_test.txt
- name: Restore pytest cache
uses: actions/cache@v2
with:
path: .pytest_cache
key: pytest-cache-${{ github.run_id }}
restore-keys: pytest-cache-
- name: Install Cleanlab Studio client
run: pip install -e .
- name: Set env variables (Linux/macOS/Windows)
@@ -80,7 +86,12 @@
run: cleanlab login --key "${{ secrets.CLEANLAB_STUDIO_CI_API_KEY }}"
- name: Run tests
run: |
pytest tests/tlm/test_properties.py --verbose
pytest -n auto tests/tlm/test_properties.py --verbose
- name: Cache pytest results
uses: actions/cache@v2
with:
path: .pytest_cache
key: pytest-cache-${{ github.run_id }}
launch-tlm-test-notif:
name: Comment TLM test start on PR
runs-on: ubuntu-latest
@@ -102,7 +113,8 @@ jobs:
comment-id: ${{ steps.fc.outputs.comment-id }}
body: |
:sparkles: **Starting TLM tests...** :sparkles:
If you want to run the TLM tests again (in case they failed or you have further updates to the TLM code), comment '/test-tlm' on this PR.
If you want to run all the TLM tests again (because TLM code is ready for review), comment '/test-tlm' on this PR.
If you want to re-run only the failed tests (you are still developing), comment '/rerun-failed-test-tlm' on this PR.
[View full GitHub Actions run log](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
reactions: rocket
finish-tlm-test-notif:
@@ -146,6 +158,6 @@ jobs:
comment-id: ${{ steps.fc.outputs.comment-id }}
body: |
:sparkles: **Tests completed!** :sparkles:
TLM Tests Results: ${{ env.TLM_TEST_RESULT }}${{ env.TLM_TEST_RESULT }} ${{ env.TLM_TEST_RESULT }}${{ env.TLM_TEST_RESULT }}${{ env.TLM_TEST_RESULT }}
TLM Tests Results: ${{ env.TLM_TEST_RESULT }}${{ env.TLM_TEST_RESULT }}${{ env.TLM_TEST_RESULT }}${{ env.TLM_TEST_RESULT }}${{ env.TLM_TEST_RESULT }}
TLM Property Tests Results: ${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}${{ env.PROPERTY_TEST_RESULT }}
Click the Github Actions run log for more information.
2 changes: 1 addition & 1 deletion .github/workflows/comment-tlm.yml
@@ -20,5 +20,5 @@ jobs:
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: 'Ensure **final** changes made to the TLM code are tested before merging. To run the TLM tests, comment `/test-tlm` on this PR.'
body: 'Ensure **final** changes made to the TLM code are tested before merging. To run the TLM tests, comment `/test-tlm` on this PR. To re-run failed property tests, comment `/rerun-failed-test-tlm` instead.'
})
8 changes: 7 additions & 1 deletion cleanlab_studio/errors.py
@@ -79,7 +79,9 @@ def __init__(self, message: str, retry_after: int):


class TlmBadRequest(HandledError):
pass
def __init__(self, message: str, retryable: bool):
self.message = message
self.retryable = retryable


class TlmServerError(APIError):
@@ -94,6 +96,10 @@ class TlmPartialSuccess(APIError):
pass


class TlmNotCalibratedError(HandledError):
pass


class UnsupportedVersionError(HandledError):
def __init__(self) -> None:
super().__init__(
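The change above makes `TlmBadRequest` carry a `retryable` flag instead of being a bare exception. A hedged illustration of how calling code could use it follows; `tlm` is assumed to be a TLM handle obtained from `Studio(...).TLM()`, and the retry policy itself is only a sketch, not anything defined in this diff.

```python
# Hedged sketch, not part of this diff: retry only when the server marked the
# request error as retryable.
import time

from cleanlab_studio.errors import TlmBadRequest


def prompt_with_retry(tlm, text: str, max_attempts: int = 3):
    for attempt in range(1, max_attempts + 1):
        try:
            return tlm.prompt(text)
        except TlmBadRequest as exc:
            if not exc.retryable or attempt == max_attempts:
                raise  # non-retryable request errors surface immediately
            time.sleep(2**attempt)  # simple exponential backoff before retrying
```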
5 changes: 3 additions & 2 deletions cleanlab_studio/internal/api/api.py
@@ -115,13 +115,14 @@ async def handle_tlm_client_error_from_resp(
try:
res_json = await resp.json()
error_message = res_json["error"]
retryable = False
except Exception:
error_message = "TLM query failed. Please try again and contact [email protected] if the problem persists."

retryable = True
if batch_index is not None:
error_message = f"Error executing query at index {batch_index}:\n{error_message}"

raise TlmBadRequest(error_message)
raise TlmBadRequest(error_message, retryable)


async def handle_tlm_api_error_from_resp(
2 changes: 1 addition & 1 deletion cleanlab_studio/internal/enrichment_utils.py
@@ -14,7 +14,7 @@

def get_prompt_outputs(
studio: Studio, prompt: str, data: pd.DataFrame, **kwargs: Any
) -> List[Optional[TLMResponse]]:
) -> List[TLMResponse]:
"""Returns the outputs of the prompt for each row in the dataframe."""
default_tlm_options = {"model": "claude-3-haiku"}
tlm_options = kwargs.get("options", {})
23 changes: 22 additions & 1 deletion cleanlab_studio/studio/studio.py
@@ -29,7 +29,7 @@
init_dataset_source,
telemetry,
)
from cleanlab_studio.utils import tlm_lite
from cleanlab_studio.utils import tlm_lite, tlm_calibrated

from . import enrichment, inference, trustworthy_language_model

@@ -652,6 +652,27 @@ def TLMLite(
verbose=verbose,
)

def TLMCalibrated(
self,
quality_preset: TLMQualityPreset = "medium",
*,
options: Optional[trustworthy_language_model.TLMOptions] = None,
timeout: Optional[float] = None,
verbose: Optional[bool] = None,
) -> tlm_calibrated.TLMCalibrated:
"""
Instantiate a version of the Trustworthy Language Model that you can calibrate using existing ratings for example prompt-response pairs.
For more details, see the documentation of:
[cleanlab_studio.utils.tlm_calibrated.TLMCalibrated](../utils.tlm_calibrated/#class-tlmcalibrated)
"""
return tlm_calibrated.TLMCalibrated(
self._api_key,
quality_preset,
options=options,
timeout=timeout,
verbose=verbose,
)

def poll_cleanset_status(self, cleanset_id: str, timeout: Optional[int] = None) -> bool:
"""
This method has been deprecated, instead use: `wait_until_cleanset_ready()`
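A hedged usage sketch for the new `Studio.TLMCalibrated` factory added above. Only the constructor arguments shown in this diff are exercised; how the calibrated model is subsequently fit on rated prompt-response pairs is defined in `cleanlab_studio.utils.tlm_calibrated` and not shown here. The API key value and keyword choices are placeholders.

```python
# Hedged sketch, not part of this diff: instantiate the calibrated TLM via the
# new Studio.TLMCalibrated factory.
from cleanlab_studio import Studio

studio = Studio("<YOUR_API_KEY>")  # placeholder API key
calibrated_tlm = studio.TLMCalibrated(
    quality_preset="medium",  # same default as the method signature above
    timeout=60,               # optional per-query timeout (seconds, as in the base TLM)
    verbose=True,
)
```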
