|
2 | 2 | import os
|
3 | 3 | from pathlib import Path
|
4 | 4 |
|
5 |
| -import chardet |
| 5 | +import charset_normalizer |
6 | 6 |
|
7 | 7 | from manubot_ai_editor import env_vars
|
8 | 8 | from manubot_ai_editor.prompt_config import ManuscriptPromptConfig, IGNORE_FILE
|
@@ -282,13 +282,25 @@ def revise_file(
|
282 | 282 | if section_name is None:
|
283 | 283 | section_name = self.get_section_from_filename(input_filename)
|
284 | 284 |
|
285 |
| - # detect the input file encoding using chardet |
| 285 | + # apply encoding settings via the env vars AI_EDITOR_SRC_ENCODING and AI_EDITOR_DEST_ENCODING, |
| 286 | + # if specified; otherwise, detect the encoding using charset_normalizer |
| 287 | + src_encoding = os.environ.get(env_vars.SRC_ENCODING) |
| 288 | + dest_encoding = os.environ.get(env_vars.DEST_ENCODING) |
| 289 | + |
| 290 | + # when AI_EDITOR_SRC_ENCODING is not set, detect the input file encoding using charset_normalizer |
286 | 291 | # maintain that encoding when reading and writing files
|
287 |
| - src_encoding = chardet.detect(input_filepath.read_bytes())["encoding"] |
| 292 | + if src_encoding is None: |
| 293 | + src_encoding = charset_normalizer.detect(input_filepath.read_bytes())["encoding"] |
| 294 | + |
| 295 | + # when AI_EDITOR_DEST_ENCODING is not set, default the output file encoding to the source encoding |
| 296 | + if dest_encoding is None: |
| 297 | + dest_encoding = src_encoding |
288 | 298 |
|
289 | 299 | print("Detected encoding:", src_encoding, flush=True)
|
290 | 300 |
|
291 |
| - with open(input_filepath, "r", encoding=src_encoding) as infile, open(output_filepath, "w", encoding=src_encoding) as outfile: |
| 301 | + with open(input_filepath, "r", encoding=src_encoding) as infile, \ |
| 302 | + open(output_filepath, "w", encoding=dest_encoding) as outfile: |
| 303 | + |
292 | 304 | # Initialize a temporary list to store the lines of the current paragraph
|
293 | 305 | paragraph = []
|
294 | 306 |
|
|
0 commit comments