1+ import os
12import json
23import logging
34import subprocess
4- import sys
55from argparse import ArgumentParser
66from pathlib import Path
77from statistics import mean
88
99import datasets
10- from bs4 import BeautifulSoup
11- from bs4 .dammit import EncodingDetector
1210from datasets import config , load_from_disk
1311from datasets .utils .logging import set_verbosity_info
1412
1513set_verbosity_info ()
1614logger = logging .getLogger (__name__ )
1715
18- # For `soup.decode_content` that can hit the limit
19- sys .setrecursionlimit (10000 )
20-
2116
2217def get_args ():
2318 parser = ArgumentParser ()
@@ -59,6 +54,11 @@ def main():
5954 args = get_args ()
6055 logger .info (f"** The job is runned with the following arguments: **\n { args } \n **** " )
6156
57+ if os .path .isfile (args .save_path_stats_json ):
58+ logger .info (f" --- Statistics already computed for seed id { args .seed_id } " )
59+ return
60+
61+ logger .info (f" --- Statistics not already computed for seed id { args .seed_id } " )
6262 if not args .use_datasets_caching :
6363 datasets .set_caching_enabled (False )
6464 else :
@@ -92,8 +92,10 @@ def main():
9292
9393 ds_html = splits [selected_mime_types [0 ]]
9494
95+ logger .info (f"the currents splits are { data_stats } ." )
96+
def get_length_text(example):
    """Record the length of ``example["text"]`` under ``example["length_text"]``.

    A missing text (``None``) is counted as length 0 instead of raising
    ``TypeError`` from ``len(None)``, so such rows are treated the same way
    as rows whose text is the empty string.

    Args:
        example: Mapping for a single row; must contain a ``"text"`` key
            whose value is a sized object (e.g. ``str``) or ``None``.

    Returns:
        The same ``example`` object, mutated in place with the extra
        ``"length_text"`` integer field.
    """
    text = example["text"]
    example["length_text"] = len(text) if text is not None else 0
    return example
98100
99101 cols_to_remove = [col for col in ds .column_names if col not in ["content_languages" , "url_host_tld" ]]
@@ -105,7 +107,9 @@ def get_length_text(example):
105107 )
106108
107109 data_stats ["html_empty_text" ] = len ([e for e in ds_html ["length_text" ] if e == 0 ])
108- data_stats ["html_mean_length_non_empty_text" ] = mean ([e for e in ds_html ["length_text" ] if e != 0 ])
110+
111+ non_empty_texts = [e for e in ds_html ["length_text" ] if e != 0 ]
112+ data_stats ["html_mean_length_non_empty_text" ] = mean (non_empty_texts ) if non_empty_texts != [] else None
109113 data_stats ["seed_id" ] = args .seed_id
110114
111115 logger .info (f"There is { data_stats ['html_empty_text' ]} empty text rows out of { len (ds_html )} rows." )
@@ -119,7 +123,8 @@ def get_length_text(example):
119123 subprocess .run (["mv" , save_path_tmp , str (save_path .absolute ())])
120124
121125 save_path = Path (args .save_path_stats_full_json )
122- save_path_tmp = f"{ str (save_path .absolute ())} .tmp"
126+ tmp_file_name = f"tmp-{ str (save_path .name )} "
127+ save_path_tmp = os .path .join (save_path .parent , tmp_file_name )
123128 logger .info (f"Saving the dataset at { save_path_tmp } " )
124129 ds_html .to_json (
125130 save_path_tmp ,
0 commit comments