aboutcode-org#3659 Fix copyright detection normalization

arshad-muhammad · Oct 4, 2024 · 5f167e1 · 5f167e1
1 parent 0d2ce3f
commit 5f167e1
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 0 deletions.
diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
@@ -26,6 +26,29 @@
 
 from cluecode import copyrights_hint
 
+from cluecode.normalizer import normalize_copyright_symbols
+
+def detect_copyrights(file_path):
+    """
+    Return the text of the file at `file_path` after passing it through
+    normalize_copyright_symbols().
+
+    The file is decoded as UTF-8 and only read: the result is returned to
+    the caller and is never written back over the input file.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        text = file.read()
+
+    # Normalize in memory only, so that detection cannot alter its input.
+    return normalize_copyright_symbols(text)
+
+# Run the demo only when executed as a script: importing this module must
+# not read, rewrite or print a file found in the current directory.
+if __name__ == '__main__':
+    file_path = "./copyright.py"
+    normalized_content = detect_copyrights(file_path)
+    print(normalized_content)
+
 # Tracing flags
 TRACE = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT', False)
 

diff --git a/src/cluecode/copyrights_hint.py b/src/cluecode/copyrights_hint.py
@@ -14,6 +14,29 @@
 # A regex to match a string that may contain a copyright year.
 # This is a year between 1960 and today prefixed and suffixed with
 # either a white-space or some punctuation.
+from cluecode.normalizer import normalize_copyright_symbols
+
+def detect_copyrights(file_path):
+    """
+    Return the text of the file at `file_path` after passing it through
+    normalize_copyright_symbols().
+
+    The file is decoded as UTF-8 and only read: the result is returned to
+    the caller and is never written back over the input file.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        text = file.read()
+
+    # Normalize in memory only, so that detection cannot alter its input.
+    return normalize_copyright_symbols(text)
+
+# Run the demo only when executed as a script: importing this module must
+# not read, rewrite or print a file found in the current directory.
+if __name__ == '__main__':
+    file_path = "./copyright.py"
+    normalized_content = detect_copyrights(file_path)
+    print(normalized_content)
+
 
 all_years = tuple(str(year) for year in range(1960, datetime.today().year))
 years = r'[\(\.,\-\)\s]+(' + '|'.join(all_years) + r')([\(\.,\-\)\s]+|$)'