5858 run-extract-evals :
5959 needs : [run-lint, run-build]
6060 runs-on : ubuntu-latest
61- timeout-minutes : 25
61+ timeout-minutes : 50
6262 env :
6363 OPENAI_API_KEY : ${{ secrets.OPENAI_API_KEY }}
6464 ANTHROPIC_API_KEY : ${{ secrets.ANTHROPIC_API_KEY }}
@@ -82,31 +82,32 @@ jobs:
8282 - name : Install Playwright browsers
8383 run : npm exec playwright install --with-deps
8484
85- # Run extract category with domExtract
85+ # 1. Run extract category with domExtract
8686 - name : Run Extract Evals (domExtract)
8787 run : npm run evals category extract -- --extract-method=domExtract
8888 - name : Save Extract Dom Results
8989 run : mv eval-summary.json eval-summary-extract-dom.json
9090
91- # Run extract category with textExtract
91+ # 2. Once domExtract finishes, run extract category with textExtract
9292 - name : Run Extract Evals (textExtract)
9393 run : npm run evals category extract -- --extract-method=textExtract
94- continue-on-error : true
95- # - name: Save Extract Text Results
96- # run: mv eval-summary.json eval-summary-extract-text.json
94+ - name : Save Extract Text Results
95+ run : mv eval-summary.json eval-summary-extract-text.json
9796
97+ # 3. Log and Compare Extract Evals Performance
9898 - name : Log and Compare Extract Evals Performance
9999 run : |
100100 experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
101101 dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
102102 echo "DomExtract Extract category score: $dom_score%"
103103 echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
104104
105- # experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json)
106- # text_score=$(jq '.categories.extract' eval-summary-extract-text.json)
107- # echo "TextExtract Extract category score: $text_score%"
108- # echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"
105+ experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json)
106+ text_score=$(jq '.categories.extract' eval-summary-extract-text.json)
107+ echo "TextExtract Extract category score: $text_score%"
108+ echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"
109109
110+ # 4. Fail CI if the domExtract extract category score is below 80%
110111 if (( $(echo "$dom_score < 80" | bc -l) )); then
111112 echo "DomExtract extract category score is below 80%. Failing CI."
112113 exit 1
@@ -115,7 +116,7 @@ jobs:
115116 run-text-extract-evals :
116117 needs : [run-extract-evals]
117118 runs-on : ubuntu-latest
118- timeout-minutes : 40
119+ timeout-minutes : 120
119120 env :
120121 OPENAI_API_KEY : ${{ secrets.OPENAI_API_KEY }}
121122 ANTHROPIC_API_KEY : ${{ secrets.ANTHROPIC_API_KEY }}
@@ -139,36 +140,36 @@ jobs:
139140 - name : Install Playwright browsers
140141 run : npm exec playwright install --with-deps
141142
142- # Run text_extract category with domExtract
143+ # 1. Run text_extract category with textExtract first
144+ - name : Run text_extract Evals (textExtract)
145+ run : npm run evals category text_extract -- --extract-method=textExtract
146+ - name : Save text_extract Text Results
147+ run : mv eval-summary.json eval-summary-text_extract-text.json
148+
149+ # 2. Then run text_extract category with domExtract
143150 - name : Run text_extract Evals (domExtract)
144151 run : npm run evals category text_extract -- --extract-method=domExtract
145152 - name : Save text_extract Dom Results
146153 run : mv eval-summary.json eval-summary-text_extract-dom.json
147154
148- # Run text_extract category with textExtract
149- - name : Run text_extract Evals (textExtract)
150- run : npm run evals category text_extract -- --extract-method=textExtract
151- continue-on-error : true
152- # - name: Save text_extract Text Results
153- # run: mv eval-summary.json eval-summary-text_extract-text.json
154-
155+ # 3. Log and Compare text_extract Evals Performance
155156 - name : Log and Compare text_extract Evals Performance
156157 run : |
158+ experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
159+ text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
160+ echo "TextExtract text_extract category score: $text_score%"
161+ echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"
162+
157163 experimentNameDom=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
158164 dom_score=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
159165 echo "DomExtract text_extract category score: $dom_score%"
160166 echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}"
161167
162- # experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
163- # text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
164- # echo "TextExtract text_extract category score: $text_score%"
165- # echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}"
166-
167- # Fail CI only if textExtract is below 80%
168- # if (( $(echo "$text_score < 80" | bc -l) )); then
169- # echo "textExtract text_extract category score is below 80%. Failing CI."
170- # exit 1
171- # fi
168+ # 4. Fail CI if the textExtract text_extract category score is below 80%
169+ if (( $(echo "$text_score < 80" | bc -l) )); then
170+ echo "textExtract text_extract category score is below 80%. Failing CI."
171+ exit 1
172+ fi
172173
173174 run-act-evals :
174175 runs-on : ubuntu-latest
0 commit comments