Skip to content

Commit

Permalink
Added deprecation for memmap (#517)
Browse files Browse the repository at this point in the history
Co-authored-by: Pete <[email protected]>
  • Loading branch information
soldni and epwalsh authored Mar 21, 2024
1 parent 83cc8b1 commit 8949bd8
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ jobs:
task:
name: Data pipeline
run: |
- python scripts/prepare_memmap_dataset.py test_fixtures/*.json.gz -o /tmp/c4-sample.npy --validate
+ python scripts/prepare_memmap_dataset.py test_fixtures/*.json.gz -o /tmp/c4-sample.npy --validate --ack-deprecated
steps:
- uses: actions/checkout@v3
Expand Down
14 changes: 14 additions & 0 deletions scripts/prepare_memmap_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ def make_source_and_target(
"--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open"
)
@click.option("-j", "--workers", "max_workers", type=int, default=1, help="Defaults to number of CPUs")
@click.option("--ack-deprecated", is_flag=True, help="Acknowledge that this command is deprecated")
def main(
src: Tuple[str, ...],
output: str,
Expand All @@ -389,7 +390,20 @@ def main(
paths_per_worker: int = 1,
max_workers: int = 1,
cache_dir: Optional[str] = None,
ack_deprecated: bool = False,
):
print("WARNING: THIS SCRIPT IS DEPRECATED!!!")
print(
"Consider using the tokenization tool in the Dolma toolkit: "
"https://github.com/allenai/dolma/blob/main/docs/tokenize.md"
)

if not ack_deprecated:
continue_question = input("Do you want to continue? [y/N]: ")
if not (c := continue_question.lower().strip()) or c != "y":
print("Aborting.")
return

print("=== CONFIGURATION ===")
print(f"src: {src}")
print(f"output: {output}")
Expand Down

0 comments on commit 8949bd8

Please sign in to comment.