-
Notifications
You must be signed in to change notification settings - Fork 557
ci: Dataset check on new PR #3103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
85c4d0f
1ab9ddb
f0a212a
57b5d37
8275f8f
d2314e7
f15acf8
de16b26
f20082c
68b5d80
5b32cf1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
# Checks that datasets referenced by task files touched in a pull request
# can still be loaded from HuggingFace.
name: Datasets available on HuggingFace - PR

on:
  pull_request:
    # Only run when task definitions change.
    paths:
      - "mteb/tasks/**.py"

jobs:
  run-pr-datasets-loading-check:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        run: |
          make install-for-tests

      - name: Run dataset loading tests
        # Pass the base ref through an environment variable instead of
        # interpolating the expression into the script text, so the shell
        # never parses the branch name as code.
        env:
          BASE_BRANCH: ${{ github.event.pull_request.base.ref }}
        run: |
          make dataset-load-test-pr BASE_BRANCH="$BASE_BRANCH"
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,130 @@ | ||||||
| from __future__ import annotations | ||||||
|
|
||||||
| import argparse | ||||||
| import ast | ||||||
| import logging | ||||||
| import os | ||||||
|
|
||||||
| from .extract_model_names import get_changed_files | ||||||
isaac-chung marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
|
||||||
| logging.basicConfig(level=logging.INFO) | ||||||
isaac-chung marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
|
||||||
|
|
||||||
def _iter_dataset_infos(tree: ast.AST):
    """Yield (path, revision) tuples found in a parsed module, in walk order."""
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            # Task classes are recognised by a ``metadata = <Call>(...)``
            # assignment anywhere inside the class body.
            for class_node in ast.walk(node):
                if not (
                    isinstance(class_node, ast.Assign)
                    and isinstance(class_node.value, ast.Call)
                ):
                    continue
                for target in class_node.targets:
                    if isinstance(target, ast.Name) and target.id == "metadata":
                        dataset_info = extract_dataset_from_metadata(
                            class_node.value
                        )
                        if dataset_info:
                            yield dataset_info
        elif isinstance(node, ast.Assign) and isinstance(node.value, ast.Dict):
            # Also pick up direct ``dataset = {...}`` dictionary assignments.
            for target in node.targets:
                if isinstance(target, ast.Name) and target.id == "dataset":
                    dataset_info = extract_dataset_from_dict(node.value)
                    if dataset_info:
                        yield dataset_info


def extract_datasets(files: list[str]) -> list[tuple[str, str]]:
    """Extract unique dataset (path, revision) tuples from task class files.

    Every file is parsed with ``ast``; nothing is imported or executed.
    Files that do not exist or that contain a syntax error are skipped with
    a warning.

    Side effects: when at least one dataset is found, the environment
    variable ``CUSTOM_DATASET_REVISIONS`` is set to
    ``"path1:revision1,path2:revision2,..."`` and a matching shell
    ``export`` line is printed to stdout.

    Args:
        files: Paths of Python source files to scan.

    Returns:
        De-duplicated ``(path, revision)`` tuples in first-seen order.
    """
    datasets: list[tuple[str, str]] = []

    for file in files:
        try:
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's default encoding.
            with open(file, encoding="utf-8") as f:
                tree = ast.parse(f.read())
        except FileNotFoundError:
            # NOTE(review): presumably the changed-file list can contain
            # paths deleted by the PR -- confirm against get_changed_files.
            logging.warning("Skipping %s: file does not exist", file)
            continue
        except SyntaxError as e:
            logging.warning("Could not parse %s: %s", file, e)
            continue

        datasets.extend(_iter_dataset_infos(tree))

    # Remove duplicates while preserving order
    unique_datasets = list(dict.fromkeys(datasets))

    # Set environment variable in format "path1:revision1,path2:revision2,..."
    if unique_datasets:
        custom_revisions = ",".join(
            f"{path}:{revision}" for path, revision in unique_datasets
        )
        os.environ["CUSTOM_DATASET_REVISIONS"] = custom_revisions
        logging.debug("Set CUSTOM_DATASET_REVISIONS=%s", custom_revisions)

        # Emitted only when something was found, so ``custom_revisions`` is
        # always bound here.
        print(f'export CUSTOM_DATASET_REVISIONS="{custom_revisions}"')

    return unique_datasets
|
|
||||||
|
|
||||||
def extract_dataset_from_metadata(call_node: ast.Call) -> tuple[str, str] | None:
    """Extract dataset info from a TaskMetadata-style call node.

    Looks for a ``dataset=`` keyword argument whose value is a dict literal
    and delegates to ``extract_dataset_from_dict``.

    Args:
        call_node: The call expression assigned to ``metadata``.

    Returns:
        The ``(path, revision)`` tuple from the first ``dataset={...}``
        keyword, or ``None`` when there is no such keyword, its value is not
        a dict literal, or the dict lacks a usable path/revision.
    """
    for keyword in call_node.keywords:
        # ``keyword.arg`` is None for ``**kwargs`` unpacking, which simply
        # fails the name comparison.
        if keyword.arg == "dataset" and isinstance(keyword.value, ast.Dict):
            return extract_dataset_from_dict(keyword.value)
    return None
|
|
||||||
|
|
||||||
def extract_dataset_from_dict(dict_node: ast.Dict) -> tuple[str, str] | None:
    """Extract path and revision from a dataset dictionary literal.

    Only string-literal ``"path"`` and ``"revision"`` entries are considered;
    entries whose value is a name, call or any other non-literal expression
    are ignored because they cannot be resolved statically.

    Args:
        dict_node: AST node of the ``{...}`` literal.

    Returns:
        ``(path, revision)`` when both are present as non-empty string
        literals, otherwise ``None``.
    """
    found: dict[str, str] = {}

    for key, value in zip(dict_node.keys, dict_node.values):
        # ``key`` is None for ``**mapping`` entries. String literals parse as
        # ``ast.Constant`` on Python 3.8+, so no ``ast.Str`` fallback is
        # needed (``ast.Str`` is deprecated and removed in newer Pythons).
        if not isinstance(key, ast.Constant):
            continue
        if key.value not in ("path", "revision"):
            continue
        if isinstance(value, ast.Constant) and isinstance(value.value, str):
            found[key.value] = value.value

    path = found.get("path")
    revision = found.get("revision")
    if path and revision:
        return (path, revision)
    return None
|
|
||||||
|
|
||||||
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line arguments of the script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what a no-argument call did
            before this parameter existed.

    Returns:
        Namespace with a single attribute ``base_branch`` (default
        ``"main"``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "base_branch",
        nargs="?",
        default="main",
        help="Base branch to compare changes with",
    )
    return parser.parse_args(argv)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Extract datasets from changed task files compared to a base branch.
    #
    # The base branch can be passed as an argument and defaults to 'main',
    # e.g. python -m scripts.extract_datasets mieb
    logging.basicConfig(level=logging.INFO)

    args = parse_args()

    base_branch = args.base_branch
    # Only files under mteb/tasks/ are scanned for dataset definitions.
    changed_files = get_changed_files(base_branch, startswith="mteb/tasks/")
    dataset_tuples = extract_datasets(changed_files)

    logging.debug(f"Found {len(dataset_tuples)} unique datasets.")
Uh oh!
There was an error while loading. Please reload this page.