Skip to content

Commit e31679e

Browse files
committed
Switches from chardet to charset_normalizer. Adds SRC/DEST_ENCODING env vars to specify src/dest encoding manually. Other minor touchups.
1 parent 93cb08f commit e31679e

File tree

3 files changed

+30
-5
lines changed

3 files changed

+30
-5
lines changed

libs/manubot_ai_editor/editor.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
from pathlib import Path
44

5-
import chardet
5+
import charset_normalizer
66

77
from manubot_ai_editor import env_vars
88
from manubot_ai_editor.prompt_config import ManuscriptPromptConfig, IGNORE_FILE
@@ -282,13 +282,25 @@ def revise_file(
282282
if section_name is None:
283283
section_name = self.get_section_from_filename(input_filename)
284284

285-
# detect the input file encoding using chardet
285+
# apply encoding settings via the env vars AI_EDITOR_SRC_ENCODING and AI_EDITOR_DEST_ENCODING,
286+
# if specified; otherwise, detect the encoding using charset_normalizer
287+
src_encoding = os.environ.get(env_vars.SRC_ENCODING)
288+
dest_encoding = os.environ.get(env_vars.DEST_ENCODING)
289+
290+
# detect the input file encoding using charset_normalizer
286291
# maintain that encoding when reading and writing files
287-
src_encoding = chardet.detect(input_filepath.read_bytes())["encoding"]
292+
if src_encoding is None:
293+
src_encoding = charset_normalizer.detect(input_filepath.read_bytes())["encoding"]
294+
295+
# ensure that we have a valid encoding for the output file
296+
if dest_encoding is None:
297+
dest_encoding = src_encoding
288298

289299
print("Detected encoding:", src_encoding, flush=True)
290300

291-
with open(input_filepath, "r", encoding=src_encoding) as infile, open(output_filepath, "w", encoding=src_encoding) as outfile:
301+
with open(input_filepath, "r", encoding=src_encoding) as infile, \
302+
open(output_filepath, "w", encoding=dest_encoding) as outfile:
303+
292304
# Initialize a temporary list to store the lines of the current paragraph
293305
paragraph = []
294306

libs/manubot_ai_editor/env_vars.py

+13
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,16 @@
7070
# The complete list of placeholders is: {paragraph_text}, {section_name},
7171
# {title}, {keywords}.
7272
CUSTOM_PROMPT = "AI_EDITOR_CUSTOM_PROMPT"
73+
74+
# Specifies the source and destination encodings of input and output markdown
75+
# files. Behavior is as follows:
76+
# - If neither SRC_ENCODING nor DEST_ENCODING is specified, the tool will
77+
# attempt to identify the encoding using the charset_normalizer library and
78+
# use that encoding to both read and write the output files.
79+
# - If only SRC_ENCODING is specified, it will be used to both read and write
80+
# the files.
81+
# - If only DEST_ENCODING is specified, it will be used to write the output
82+
# files, and the input files will be read using the encoding identified by
83+
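# charset_normalizer. (If both are specified, SRC_ENCODING is used to read the input files and DEST_ENCODING to write the output files.)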
84+
SRC_ENCODING = "AI_EDITOR_SRC_ENCODING"
85+
DEST_ENCODING = "AI_EDITOR_DEST_ENCODING"

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
install_requires=[
2828
"openai==0.28",
2929
"pyyaml",
30-
"chardet==5.2.0",
30+
"charset_normalizer==3.4.0"
3131
],
3232
classifiers=[
3333
"Programming Language :: Python :: 3",

0 commit comments

Comments
 (0)