Switches from chardet to charset_normalizer. Adds SRC/DEST_ENCODING env vars to specify src/dest encoding manually. Other minor touchups.

falquaddoomi · falquaddoomi · commit e31679e81ce8 · 2024-11-12T16:30:16.000-07:00
diff --git a/libs/manubot_ai_editor/editor.py b/libs/manubot_ai_editor/editor.py
@@ -2,7 +2,7 @@
 import os
 from pathlib import Path
 
-import chardet
+import charset_normalizer
 
 from manubot_ai_editor import env_vars
 from manubot_ai_editor.prompt_config import ManuscriptPromptConfig, IGNORE_FILE
@@ -282,13 +282,25 @@ def revise_file(
         if section_name is None:
             section_name = self.get_section_from_filename(input_filename)
 
-        # detect the input file encoding using chardet
+        # apply encoding settings via the env vars AI_EDITOR_SRC_ENCODING and AI_EDITOR_DEST_ENCODING,
+        # if specified; otherwise, detect the encoding using charset_normalizer
+        src_encoding = os.environ.get(env_vars.SRC_ENCODING)
+        dest_encoding = os.environ.get(env_vars.DEST_ENCODING)
+
+        # if no source encoding was given, detect it using charset_normalizer and
         # maintain that encoding when reading and writing files
-        src_encoding = chardet.detect(input_filepath.read_bytes())["encoding"]
+        if src_encoding is None:
+            src_encoding = charset_normalizer.detect(input_filepath.read_bytes())["encoding"]
+
+        # if no output encoding was given, write with the same encoding used for reading
+        if dest_encoding is None:
+            dest_encoding = src_encoding
 
         print("Detected encoding:", src_encoding, flush=True)
 
-        with open(input_filepath, "r", encoding=src_encoding) as infile, open(output_filepath, "w", encoding=src_encoding) as outfile:
+        with open(input_filepath,  "r", encoding=src_encoding) as infile, \
+             open(output_filepath, "w", encoding=dest_encoding) as outfile:
+            
             # Initialize a temporary list to store the lines of the current paragraph
             paragraph = []
 
diff --git a/libs/manubot_ai_editor/env_vars.py b/libs/manubot_ai_editor/env_vars.py
@@ -70,3 +70,16 @@
 # The complete list of placeholders is: {paragraph_text}, {section_name},
 # {title}, {keywords}.
 CUSTOM_PROMPT = "AI_EDITOR_CUSTOM_PROMPT"
+
+# Specify the encodings used to read input and write output markdown files:
+# - If neither SRC_ENCODING nor DEST_ENCODING is specified, the tool detects
+#   the input encoding with the charset_normalizer library and uses that
+#   encoding both to read the input files and to write the output files.
+# - If only SRC_ENCODING is specified, it is used both to read the input
+#   files and to write the output files.
+# - If only DEST_ENCODING is specified, it is used to write the output files;
+#   the input files are read with the encoding charset_normalizer detects.
+# - If both are specified, SRC_ENCODING is used to read the input files and
+#   DEST_ENCODING is used to write the output files.
+SRC_ENCODING = "AI_EDITOR_SRC_ENCODING"
+DEST_ENCODING = "AI_EDITOR_DEST_ENCODING"
diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@
     install_requires=[
         "openai==0.28",
         "pyyaml",
-        "chardet==5.2.0",
+        "charset_normalizer==3.4.0"
     ],
     classifiers=[
         "Programming Language :: Python :: 3",