From 2d71ceaf0d3b8960906da80aa467e8819fbd30f8 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Fri, 17 May 2024 13:52:34 -0700 Subject: [PATCH] Support news-crawl importer (#608) --- utils/find_corpus.py | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/utils/find_corpus.py b/utils/find_corpus.py index b6b16d099..47f4484e9 100755 --- a/utils/find_corpus.py +++ b/utils/find_corpus.py @@ -305,14 +305,15 @@ def get_huggingface_any(language: str): ) -def get_remote_file_size(url: str) -> Optional[int]: +def get_remote_file_size(url: str, display_not_200: bool = True) -> Optional[int]: try: response = requests.head(url, timeout=1) if response.status_code == 200: return humanize.naturalsize(int(response.headers.get("Content-Length", 0))) else: - print(f"Failed to retrieve file information. Status code: {response.status_code}") + if display_not_200: + print(f"Failed to retrieve file information. Status code: {response.status_code}") return None except requests.exceptions.RequestException as e: print(f"An error occurred: {e}") @@ -389,6 +390,37 @@ def get_name(entry): print_yaml(names, exclude=excludes) +def get_news_crawl(source: str, target: str): + for lang in (source, target): + datasets = [] + for i in range(20): + year = 2007 + i + name = f"news-crawl_news.{year}" + url = ( + f"https://data.statmt.org/news-crawl/{lang}/news.{year}.{lang}.shuffled.deduped.gz" + ) + size = get_remote_file_size(url, display_not_200=False) + if size is not None: + datasets.append((name, url, size)) + + print("") + print("┌─────────────────────────────────────────────────────────────────────┐") + print(f"│ news-crawl ({lang}) - https://data.statmt.org/news-crawl            │") + print("└─────────────────────────────────────────────────────────────────────┘") + print_table( + [ + [ + "Dataset", + "URL", + "Size", + ], + *[[name, url, size] for name, url, size in datasets], + ] + ) + + print_yaml([name for name, _, _ in datasets]) 
+ + def print_yaml(names: list[str], exclude: list[str] = []): cleaned = set() for name in names: @@ -441,6 +473,7 @@ def main(args: Optional[list[str]] = None) -> None: "huggingface_mono", "huggingface_parallel", "huggingface_any", + "news-crawl", ] parser = argparse.ArgumentParser( description=__doc__, @@ -488,6 +521,9 @@ def main(args: Optional[list[str]] = None) -> None: if args.importer == "huggingface_any" or not args.importer: get_huggingface_any(args.target if args.source == "en" else args.source) + if args.importer == "news-crawl" or not args.importer: + get_news_crawl(args.source, args.target) + if __name__ == "__main__": main()