From 2d71ceaf0d3b8960906da80aa467e8819fbd30f8 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Fri, 17 May 2024 13:52:34 -0700 Subject: [PATCH] Support news-crawl importer (#608) --- utils/find_corpus.py | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/utils/find_corpus.py b/utils/find_corpus.py index b6b16d099..47f4484e9 100755 --- a/utils/find_corpus.py +++ b/utils/find_corpus.py @@ -305,14 +305,15 @@ def get_huggingface_any(language: str): ) -def get_remote_file_size(url: str) -> Optional[int]: +def get_remote_file_size(url: str, display_not_200: bool = True) -> Optional[int]: try: response = requests.head(url, timeout=1) if response.status_code == 200: return humanize.naturalsize(int(response.headers.get("Content-Length", 0))) else: - print(f"Failed to retrieve file information. Status code: {response.status_code}") + if display_not_200: + print(f"Failed to retrieve file information. Status code: {response.status_code}") return None except requests.exceptions.RequestException as e: print(f"An error occurred: {e}") @@ -389,6 +390,37 @@ def get_name(entry): print_yaml(names, exclude=excludes) +def get_news_crawl(source: str, target: str): + for lang in (source, target): + datasets = [] + for i in range(20): + year = 2007 + i + name = f"news-crawl_news.{year}" + url = ( + f"https://data.statmt.org/news-crawl/{lang}/news.{year}.{lang}.shuffled.deduped.gz" + ) + size = get_remote_file_size(url, display_not_200=False) + if size is not None: + datasets.append((name, url, size)) + + print("") + print("┌─────────────────────────────────────────────────────────────────────┐") + print(f"│ news-crawl ({lang}) - https://data.statmt.org/news-crawl            │") + print("└─────────────────────────────────────────────────────────────────────┘") + print_table( + [ + [ + "Dataset", + "URL", + "Size", + ], + *[[name, url, size] for name, url, size in datasets], + ] + ) + + print_yaml([name for name, _, _ in datasets]) 
+ + def print_yaml(names: list[str], exclude: list[str] = []): cleaned = set() for name in names: @@ -441,6 +473,7 @@ def main(args: Optional[list[str]] = None) -> None: "huggingface_mono", "huggingface_parallel", "huggingface_any", + "news-crawl", ] parser = argparse.ArgumentParser( description=__doc__, @@ -488,6 +521,9 @@ def main(args: Optional[list[str]] = None) -> None: if args.importer == "huggingface_any" or not args.importer: get_huggingface_any(args.target if args.source == "en" else args.source) + if args.importer == "news-crawl" or not args.importer: + get_news_crawl(args.source, args.target) + if __name__ == "__main__": main()