Skip to content

Commit

Permalink
Add support for bad file recovery (#5)
Browse files (browse the repository at this point in the history)
Co-authored-by: Axel Dahl <[email protected]>
  • Loading branch information
whisperstream and axel-dahl-zoom authored Aug 23, 2021
1 parent 240766a commit 61eac9f
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions duplicate_code_detection.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -135,9 +135,13 @@ def run(fail_threshold, directories, files, ignore_directories, ignore_files,
# Parse the contents of all the source files
source_code = OrderedDict()
for source_code_file in source_code_files:
with open(source_code_file, 'r') as f:
# Store source code with the file path as the key
source_code[source_code_file] = f.read()
try:
# read file but also recover from encoding errors in source files
with open(source_code_file, 'r', errors='surrogateescape') as f:
# Store source code with the file path as the key
source_code[source_code_file] = f.read()
except Exception as err:
print(f'ERROR: Failed to open file {source_code_file}, reason: {str(err)}')

# Create a Similarity object of all the source code
gen_docs = [[word.lower() for word in word_tokenize(source_code[source_file])]
Expand Down

0 comments on commit 61eac9f

Please sign in to comment.