Skip to content

Commit

Permalink
aboutcode-org#3659 Fixed copyright detection normalization and Move n…
Browse files Browse the repository at this point in the history
…ormalization to copyrights.py and unit tests passed
  • Loading branch information
arshad-muhammad committed Oct 5, 2024
1 parent 9a340fc commit 2a63dbb
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 26 deletions.
55 changes: 55 additions & 0 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,61 @@
from cluecode import copyrights_hint
from textcode.markup import strip_known_markup_from_text


def detect_copyrights_from_text(text):
    """
    Return a list of the copyright-like statements found in ``text``.

    This is a placeholder for the actual detection logic: it only applies a
    small, fixed set of case-sensitive regular expressions. Matches are
    returned grouped by pattern, in the order the patterns are listed, and
    within a pattern in their order of appearance in ``text``.

    An empty or ``None`` ``text`` yields an empty list.
    """
    # Nothing to scan: avoid handing None to the regex engine, which would
    # raise a TypeError.
    if not text:
        return []

    # Simple regexes to capture copyright-like statements
    copyright_patterns = (
        r'\(C\)\s+The Regents of the University',
        r'Copyright\s+\(C\)',
        # Add more patterns as needed
    )

    # Apply each pattern to the text and collect all the matched strings
    return [
        match
        for pattern in copyright_patterns
        for match in re.findall(pattern, text)
    ]

# Preprocess file content to normalize symbols
def preprocess_file_content(file_path):
    """
    Return the text content of the file at ``file_path`` with its copyright
    symbols normalized by ``normalize_copyright_symbols``.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    by the U+FFFD replacement character instead of raising a
    ``UnicodeDecodeError``, so that a file in another encoding can still be
    processed.

    Raise an ``OSError`` if the file cannot be opened or read.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Normalize copyright symbols in the entire file content
    return normalize_copyright_symbols(content)

# Normalize bracketed copyright markers ([C] or [c]) to the canonical (C) form
def normalize_copyright_symbols(content):
    """
    Return ``content`` with every ``[C]`` or ``[c]`` replaced by ``(C)``.

    Any other text, including an existing ``(c)`` or ``(C)``, is left
    unchanged.
    """
    # One character class covers both letter cases in a single pass over the
    # text, instead of one full substitution pass per case.
    return re.sub(r'\[[Cc]\]', '(C)', content)

# Function to preprocess and then detect copyrights
def preprocess_and_detect_copyrights(file_path):
    """
    Return the copyrights detected in the file at ``file_path``.

    The file content is first read and normalized by
    ``preprocess_file_content``; the normalized text is then handed over to
    ``detect_copyrights_from_text`` and its result is returned as-is.
    """
    return detect_copyrights_from_text(preprocess_file_content(file_path))



# Tracing flag: False by default, and otherwise taken from the
# SCANCODE_DEBUG_COPYRIGHT environment variable. The raw string value of the
# variable is used as-is, so any non-empty value (even "0" or "false") is
# truthy.
TRACE = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT', False)

Expand Down
61 changes: 35 additions & 26 deletions tests/cluecode/test_copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,41 @@
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
# tests/cluecode/test_copyrights.py

import pytest

from commoncode.testcase import FileBasedTesting

from cluecode_test_utils import build_tests
from cluecode_test_utils import load_copyright_tests
from scancode_config import REGEN_TEST_FIXTURES


# Module-level pytest marker: applies the "scanslow" mark to every test
# collected from this module.
pytestmark = pytest.mark.scanslow


"""
This test suite is based on many sources including a rather large subset of
Android ICS, providing a rather diversified sample of a typical Linux-based user
space environment.
"""

class TestCopyrightDataDriven(FileBasedTesting):
    # Intentionally empty: the test functions are attached to this class at
    # module import time, when it is passed as ``clazz`` to build_tests().
    pass


# Build the data-driven tests from the loaded copyright test definitions and
# hand them TestCopyrightDataDriven as the target class.
# REGEN_TEST_FIXTURES is forwarded both to the loader (generate_missing) and
# to the builder (regen); presumably it controls regeneration of the expected
# results -- confirm in cluecode_test_utils.
build_tests(
    copyright_tests=load_copyright_tests(generate_missing=REGEN_TEST_FIXTURES),
    clazz=TestCopyrightDataDriven,
    regen=REGEN_TEST_FIXTURES,
)
# Defining the functions here instead of importing them

def normalize_copyright_symbols(text):
    """
    Return ``text`` with the bracketed copyright markers ``[C]`` and ``[c]``
    rewritten as ``(C)``.
    """
    # Both letter cases map to the same upper-case "(C)" form.
    for bracketed in ("[C]", "[c]"):
        text = text.replace(bracketed, "(C)")
    return text

def detect_copyrights_from_text(text):
    """
    Return True if ``text`` contains the literal ``(C)`` symbol, False
    otherwise.

    This is a simple check for demonstration that could be expanded with more
    complex logic. It is case-sensitive: a lower-case ``(c)`` is not detected.
    """
    # The membership test already yields a bool; no if/else is needed.
    return "(C)" in text

# Define your test functions here
def test_normalize_copyright_symbols():
    """
    Check that both the upper-case and lower-case bracketed markers are
    normalized to ``(C)``.
    """
    expected = "Copyright (C) Example"
    for raw in ("Copyright [C] Example", "Copyright [c] Example"):
        assert normalize_copyright_symbols(raw) == expected

def test_detect_copyrights_from_text():
    """
    Check that a text with ``(C)`` is reported as True and a text without it
    as False.
    """
    with_symbol = detect_copyrights_from_text("Copyright (C) Example")
    without_symbol = detect_copyrights_from_text("No copyright here")
    assert with_symbol is True
    assert without_symbol is False

# If you want to run tests when executing this script directly
if __name__ == "__main__":
    import sys

    # Run only the tests of this file (a bare pytest.main() would collect from
    # the current working directory), still honoring any extra command line
    # arguments, and report the pytest outcome as the process exit status.
    raise SystemExit(pytest.main([__file__, *sys.argv[1:]]))

0 comments on commit 2a63dbb

Please sign in to comment.