Ingest Test Fixtures Update PR #7577
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Ingest Test Fixtures Update PR | |
on: | |
workflow_dispatch: | |
env: | |
PYTHON_VERSION: "3.10" | |
permissions: | |
id-token: write | |
contents: read | |
jobs: | |
setup: | |
runs-on: ubuntu-latest-m | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
steps: | |
- uses: actions/checkout@v3 | |
- uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ env.PYTHON_VERSION }} | |
setup_ingest: | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup] | |
steps: | |
- uses: actions/checkout@v3 | |
- uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ env.PYTHON_VERSION }} | |
check-only: 'true' | |
update-fixtures-and-pr: | |
runs-on: ubuntu-latest-m | |
needs: [setup_ingest] | |
steps: | |
# actions/checkout MUST come before auth | |
- uses: 'actions/checkout@v4' | |
- name: Set up Python ${{ env.PYTHON_VERSION }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ env.PYTHON_VERSION }} | |
- name: Get full Python version | |
id: full-python-version | |
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ env.PYTHON_VERSION }} | |
- name: Setup docker-compose | |
uses: KengoTODA/actions-setup-docker-compose@v1 | |
with: | |
version: '2.22.0' | |
- name: Update test fixtures | |
env: | |
AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }} | |
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} | |
CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }} | |
CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }} | |
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} | |
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} | |
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} | |
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} | |
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} | |
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} | |
HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }} | |
JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }} | |
JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }} | |
MONGODB_URI: ${{ secrets.MONGODB_URI }} | |
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} | |
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} | |
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} | |
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} | |
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} | |
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} | |
SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}} | |
SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}} | |
SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}} | |
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} | |
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} | |
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} | |
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}} | |
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}} | |
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} | |
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} | |
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | |
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} | |
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} | |
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} | |
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | |
OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }} | |
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} | |
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} | |
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" | |
OVERWRITE_FIXTURES: "true" | |
CI: "true" | |
run: | | |
source .venv/bin/activate | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
make install-pandoc | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get install -y tesseract-ocr | |
sudo apt-get install -y tesseract-ocr-kor | |
sudo apt-get install diffstat | |
tesseract --version | |
./test_unstructured_ingest/test-ingest-src.sh | |
- name: Save branch name to environment file | |
id: branch | |
run: | | |
original_branch=$(git rev-parse --abbrev-ref HEAD) | |
suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)" | |
branch_name="$original_branch$suffix" | |
echo "BRANCH_NAME=$branch_name" >> $GITHUB_ENV | |
- name: Save PR name to environment file | |
id: pr | |
run: | | |
commit_sha=$(git rev-parse HEAD) | |
prs=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ | |
"https://api.github.com/repos/${{ github.repository }}/commits/${commit_sha}/pulls") | |
pr_name=$(echo "$prs" | jq -r '.[0].title') | |
echo "PR_NAME=$pr_name" >> $GITHUB_ENV | |
- name: Create Pull Request | |
uses: peter-evans/create-pull-request@v5 | |
with: | |
token: ${{ secrets.GH_CREATE_PR_TOKEN }} | |
add-paths: | | |
test_unstructured_ingest/expected-structured-output | |
test_unstructured_ingest/metrics | |
commit-message: "Update ingest test fixtures" | |
branch: ${{ env.BRANCH_NAME }} | |
title: "${{ env.PR_NAME }} <- Ingest test fixtures update" | |
assignees: ${{ github.actor }} | |
reviewers: ${{ github.actor }} | |
delete-branch: true | |
body: | | |
This pull request includes updated ingest test fixtures. | |
Please review and merge if appropriate. | |
base: ${{ github.head_ref }} |