Skip to content

Commit 0f5a029

Browse files
miguelusque and nicoleeeluo
authored and committed
Fix issue NVIDIA#43 (empty files creation) and improve reading/writing speed (NVIDIA#57)
This commit fixes issue NVIDIA#43 (empty files created when invoking the reshard_jsonl method in nemo_curator.utils.file_utils.py) by checking the size of each file after it is generated and deleting any file whose size is zero. In addition, I noticed there is no need to parse the content of the individual lines into JSON objects, since they are already in JSON format. Removing that extra parsing significantly speeds up the execution of this method. Signed-off-by: Miguel Martínez <[email protected]> Signed-off-by: Nicole Luo <[email protected]>
1 parent 794a435 commit 0f5a029

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

nemo_curator/utils/file_utils.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,8 @@ def parse_str_of_num_bytes(s, return_str=False):
181181
def _save_jsonl(documents, output_path, start_index=0, max_index=10000, prefix=None):
182182
"""Worker function to write out the data to jsonl files"""
183183

184-
def _output_json(document):
185-
myjson = json.dumps(document, ensure_ascii=False)
186-
return myjson.encode("utf-8")
184+
def _encode_text(document):
185+
return document.strip().encode("utf-8")
187186

188187
def _name(start_index, npad, prefix, i):
189188
tag = str(start_index + i).rjust(npad, "0")
@@ -195,11 +194,22 @@ def _name(start_index, npad, prefix, i):
195194

196195
output_glob_string = os.path.join(output_path, "*.jsonl")
197196

198-
documents.map(_output_json).to_textfiles(
197+
output_files = documents.map(_encode_text).to_textfiles(
199198
output_glob_string,
200199
name_function=name,
201200
)
202201

202+
# Delete empty files generated due to empty partitions in the bag
203+
for output_file in output_files:
204+
try:
205+
if os.path.getsize(output_file) == 0:
206+
os.remove(output_file)
207+
except Exception as exception:
208+
print(
209+
f"An exception occurred when trying to delete {output_file}.\n{exception}",
210+
flush=True,
211+
)
212+
203213

204214
def reshard_jsonl(
205215
input_dir, output_dir, output_file_size="100M", start_index=0, file_prefix=""
@@ -212,7 +222,8 @@ def reshard_jsonl(
212222
output_dir: The output directory where the resharded jsonl files will be written
213223
output_file_size: Approximate size of output files. Must specify with a string and
214224
with the unit K, M or G for kilo, mega or gigabytes
215-
start_index: Starting index for naming the output files
225+
start_index: Starting index for naming the output files. Note: The indices may not
226+
be continuous if the sharding process would output an empty file in its place
216227
file_prefix: Prefix to use to prepend to output file number
217228
"""
218229

@@ -222,7 +233,7 @@ def reshard_jsonl(
222233
input_files = list(get_all_files_paths_under(input_dir))
223234

224235
# Read in the dask bag
225-
b = db.read_text(input_files, blocksize=blocksize).map(json.loads)
236+
b = db.read_text(input_files, blocksize=blocksize)
226237

227238
# Prepare the output
228239
output_dir = expand_outdir_and_mkdir(output_dir)

0 commit comments

Comments (0)