Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 142 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ jobs:
determine-evals:
runs-on: ubuntu-latest
outputs:
run-combination: ${{ steps.check-labels.outputs.run-combination }}
run-extract: ${{ steps.check-labels.outputs.run-extract }}
run-act: ${{ steps.check-labels.outputs.run-act }}
run-observe: ${{ steps.check-labels.outputs.run-observe }}
Expand All @@ -31,6 +32,7 @@ jobs:
# Default to running all tests on main branch
if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
echo "Running all tests for main branch"
echo "run-combination=true" >> $GITHUB_OUTPUT
echo "run-extract=true" >> $GITHUB_OUTPUT
echo "run-act=true" >> $GITHUB_OUTPUT
echo "run-observe=true" >> $GITHUB_OUTPUT
Expand All @@ -40,6 +42,7 @@ jobs:
fi

# Check for specific labels
echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT
echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT
echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -147,7 +150,7 @@ jobs:
run: npm run e2e:local

run-e2e-bb-tests:
needs: [run-e2e-tests]
needs: [run-lint, run-build]
runs-on: ubuntu-latest
timeout-minutes: 50
if: >
Expand Down Expand Up @@ -183,8 +186,129 @@ jobs:
- name: Run E2E Tests (browserbase)
run: npm run e2e:bb

# Runs the `regression_dom_extract` eval category with the domExtract method
# and exposes its category score as the `regression_dom_score` job output.
run-regression-evals-dom-extract:
  needs:
    [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 7
  outputs:
    regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }}
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: |
        rm -rf node_modules
        rm -f package-lock.json
        npm install

    - name: Build Stagehand
      run: npm run build

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Regression Evals (domExtract)
      run: npm run evals category regression_dom_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=domExtract

    # Rename the summary so the file name identifies the extract method.
    - name: Save Regression domExtract Results
      run: mv eval-summary.json eval-summary-regression-dom.json

    - name: Log Regression (domExtract) Evals Performance
      id: set-dom-score
      run: |
        experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json)
        regression_dom_score=$(jq -r '.categories.regression_dom_extract' eval-summary-regression-dom.json)

        # jq prints the literal `null` when the category is absent from the
        # summary; stop here instead of publishing a non-numeric job output.
        if ! [[ "${regression_dom_score}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
          echo "::error::regression_dom_extract score is missing or not numeric: '${regression_dom_score}'"
          exit 1
        fi

        echo "regression_dom_extract category score: ${regression_dom_score}%"
        echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}"
        echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT"

# Runs the `regression_text_extract` eval category with the textExtract method
# and exposes its category score as the `regression_text_score` job output.
run-regression-evals-text-extract:
  needs:
    [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 7
  outputs:
    regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }}
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: |
        rm -rf node_modules
        rm -f package-lock.json
        npm install

    - name: Build Stagehand
      run: npm run build

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Regression Evals (textExtract)
      run: npm run evals category regression_text_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=textExtract

    # Rename the summary so the file name identifies the extract method.
    - name: Save Regression textExtract Results
      run: mv eval-summary.json eval-summary-regression-text.json

    - name: Log Regression (textExtract) Evals Performance
      id: set-text-score
      run: |
        experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json)
        regression_text_score=$(jq -r '.categories.regression_text_extract' eval-summary-regression-text.json)

        # jq prints the literal `null` when the category is absent from the
        # summary; stop here instead of publishing a non-numeric job output.
        if ! [[ "${regression_text_score}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
          echo "::error::regression_text_extract score is missing or not numeric: '${regression_text_score}'"
          exit 1
        fi

        echo "regression_text_extract category score: ${regression_text_score}%"
        echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}"
        echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT"

# Gate job: averages the domExtract and textExtract regression scores and
# fails the workflow when the average is below 90%.
check-regression-evals-score:
  needs: [run-regression-evals-text-extract, run-regression-evals-dom-extract]
  runs-on: ubuntu-latest
  timeout-minutes: 5
  steps:
    - name: Compare Overall Regression Evals Score
      # Job outputs are handed over as environment variables rather than
      # interpolated into the script text, so their content is never parsed
      # as shell code.
      env:
        REGRESSION_DOM_SCORE: ${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}
        REGRESSION_TEXT_SCORE: ${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}
      run: |
        regression_dom_score="${REGRESSION_DOM_SCORE}"
        regression_text_score="${REGRESSION_TEXT_SCORE}"

        # Refuse to continue on a missing or non-numeric score. Without this
        # check an empty value makes bc print nothing, the (( )) test below
        # evaluates to false, and the gate passes without comparing anything;
        # a literal `null` would be read by bc as an unset variable (0).
        for score in "${regression_dom_score}" "${regression_text_score}"; do
          if ! [[ "${score}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
            echo "Regression score is missing or not numeric: '${score}'. Failing CI."
            exit 1
          fi
        done

        overall_score=$(echo "(${regression_dom_score} + ${regression_text_score}) / 2" | bc -l)
        echo "Overall regression score: ${overall_score}%"

        # Fail if overall score is below 90%
        if (( $(echo "${overall_score} < 90" | bc -l) )); then
          echo "Overall regression score is below 90%. Failing CI."
          exit 1
        fi

run-combination-evals:
needs: [run-e2e-bb-tests, run-e2e-tests, determine-evals]
needs: [check-regression-evals-score, determine-evals]
runs-on: ubuntu-latest
timeout-minutes: 40
env:
Expand All @@ -199,27 +323,43 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4

- name: Check for 'combination' label
id: label-check
run: |
if [ "${{ needs.determine-evals.outputs.run-combination }}" != "true" ]; then
echo "has_label=false" >> $GITHUB_OUTPUT
echo "No label for COMBINATION. Exiting with success."
else
echo "has_label=true" >> $GITHUB_OUTPUT
fi

- name: Set up Node.js
if: needs.determine-evals.outputs.run-combination == 'true'
uses: actions/setup-node@v4
with:
node-version: "20"

- name: Install dependencies
if: needs.determine-evals.outputs.run-combination == 'true'
run: |
rm -rf node_modules
rm -f package-lock.json
npm install

- name: Build Stagehand
if: needs.determine-evals.outputs.run-combination == 'true'
run: npm run build

- name: Install Playwright browsers
if: needs.determine-evals.outputs.run-combination == 'true'
run: npm exec playwright install --with-deps

- name: Run Combination Evals
if: needs.determine-evals.outputs.run-combination == 'true'
run: npm run evals category combination

- name: Log Combination Evals Performance
if: needs.determine-evals.outputs.run-combination == 'true'
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
Expand Down
16 changes: 8 additions & 8 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
},
{
"name": "ionwave",
"categories": ["act"]
"categories": ["act", "regression_dom_extract"]
},
{
"name": "nonsense_action",
Expand Down Expand Up @@ -79,7 +79,7 @@
},
{
"name": "wichita",
"categories": ["combination"]
"categories": ["combination", "regression_dom_extract"]
},

{
Expand All @@ -104,7 +104,7 @@
},
{
"name": "extract_aigrant_companies",
"categories": ["experimental", "text_extract"]
"categories": ["experimental", "text_extract", "regression_text_extract"]
},
{
"name": "extract_capacitor_info",
Expand Down Expand Up @@ -153,7 +153,7 @@
},
{
"name": "extract_memorial_healthcare",
"categories": ["extract"]
"categories": ["extract", "regression_dom_extract"]
},
{
"name": "extract_nhl_stats",
Expand Down Expand Up @@ -222,11 +222,11 @@
},
{
"name": "observe_github",
"categories": ["observe"]
"categories": ["observe", "regression_text_extract"]
},
{
"name": "observe_vantechjournal",
"categories": ["observe"]
"categories": ["observe", "regression_text_extract"]
},
{
"name": "observe_amazon_add_to_cart",
Expand Down Expand Up @@ -254,7 +254,7 @@
},
{
"name": "extract_hamilton_weather",
"categories": ["targeted_extract"]
"categories": ["targeted_extract", "regression_text_extract"]
},
{
"name": "extract_regulations_table",
Expand Down Expand Up @@ -286,7 +286,7 @@
},
{
"name": "scroll_75",
"categories": ["act"]
"categories": ["act", "regression_dom_extract"]
}
]
}
2 changes: 2 additions & 0 deletions types/evals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ export const EvalCategorySchema = z.enum([
"experimental",
"text_extract",
"targeted_extract",
"regression_text_extract",
"regression_dom_extract",
]);

export type EvalCategory = z.infer<typeof EvalCategorySchema>;
Expand Down