Skip to content

Commit

Permalink
aboutcode-org#3659 Fixed copyright detection normalization and Move n…
Browse files Browse the repository at this point in the history
…ormalization to copyrights.py and unit tests passed
  • Loading branch information
arshad-muhammad committed Oct 5, 2024
2 parents 2a63dbb + 5d50052 commit 6126c28
Show file tree
Hide file tree
Showing 1,040 changed files with 103,975 additions and 405 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ v33.0.0 (next next, roadmap)
- New and improved copyright detection with many false positives removed
and refined detection added.

- Fix Python ``SyntaxWarning`` in textcode module.

v32.2.1 - 2024-07-02
---------------------
Expand Down
1 change: 1 addition & 0 deletions etc/scripts/licenses/buildrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def __attrs_post_init__(self, *args, **kwargs):
print(rdat)
print("########################################################")
raise
self.data = {k: v for k, v in self.data.items() if v is not None or (v is None and k == "license_expression")}


def load_data(location="00-new-licenses.txt"):
Expand Down
94 changes: 73 additions & 21 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pygmars import Token
from pygmars.tree import Tree


from cluecode import copyrights_hint
from textcode.markup import strip_known_markup_from_text

Expand Down Expand Up @@ -162,8 +161,24 @@ def detect_copyrights(
Strip markup from text if ``demarkup`` is True.
Run for up to ``deadline`` seconds and return results found so far.
"""
from cluecode.linux_credits import detect_credits_authors

from textcode.analysis import numbered_text_lines

if include_authors:
author_detections = list(detect_credits_authors(location))

if TRACE:
logger_debug('detect_copyrights: detect_credits_authors')
for detecta in author_detections:
logger_debug(f' {detecta}')

# bail out if we have a credits file with credits
if author_detections:
for a in author_detections:
yield a
return

numbered_lines = list(numbered_text_lines(location, demarkup=True))

if TRACE or TRACE_TOK:
Expand Down Expand Up @@ -716,8 +731,9 @@ def build_detection_from_node(
# Slovenian: avtorske pravice
# Ukrainian: авторське право

# rare typo copyrighy
# rare typos in copyright
(r'^Copyrighy$', 'COPY'),
(r'^Copyirght$', 'COPY'),

# OSGI
(r'^Bundle-Copyright', 'COPY'),
Expand Down Expand Up @@ -959,6 +975,7 @@ def build_detection_from_node(
(r'^[Ss]tring$', 'JUNK'),
(r'^Implementation-Vendor$', 'JUNK'),
(r'^dnl$', 'JUNK'),
(r'^ifndef$', 'JUNK'),

(r'^as$', 'NN'),
(r'^[Vv]isit$', 'JUNK'),
Expand Down Expand Up @@ -994,7 +1011,6 @@ def build_detection_from_node(
(r'^Add$', 'JUNK'),
(r'^Average$', 'JUNK'),
(r'^Taken$', 'JUNK'),
(r'^LAWS\.?$', 'JUNK'),
(r'^design$', 'JUNK'),
(r'^Driver$', 'JUNK'),
(r'^[Cc]ontribution\.?', 'JUNK'),
Expand All @@ -1004,7 +1020,7 @@ def build_detection_from_node(
(r'^Last-Translator$', 'JUNK'),
(r'^Translated$', 'JUNK'),
(r'^OMAP730$', 'JUNK'),
(r'^Law\.$', 'JUNK'),

(r'^dylid$', 'JUNK'),
(r'^BeOS$', 'JUNK'),
(r'^Generates?$', 'JUNK'),
Expand Down Expand Up @@ -1046,7 +1062,6 @@ def build_detection_from_node(
(r'^Disclaimer$', 'JUNK'),
(r'^Directive.?$', 'JUNK'),
(r'^LAWS\,?$', 'JUNK'),
(r'^[Ll]aws?,?$', 'JUNK'),
(r'^me$', 'JUNK'),
(r'^Derived$', 'JUNK'),
(r'^Limitations?$', 'JUNK'),
Expand Down Expand Up @@ -1117,7 +1132,15 @@ def build_detection_from_node(
(r'^Much$', 'JUNK'),
(r'^remains?,?$', 'JUNK'),
(r'^earlier$', 'JUNK'),
(r'^[lL]aws?$', 'JUNK'),

# there is a Mr. Law
(r'^Law[\.,]?$', 'NN'),
(r'^laws?[\.,]?$', 'JUNK'),
(r'^Laws[\.,]?$', 'JUNK'),
(r'^LAWS?[\.,]?$', 'JUNK'),
(r'^LAWS?$', 'NN'),

(r'^taken$', 'NN'),
(r'^Insert$', 'JUNK'),
(r'^url$', 'JUNK'),
(r'^[Ss]ee$', 'JUNK'),
Expand All @@ -1138,6 +1161,7 @@ def build_detection_from_node(
(r'^[Ii]nterfaces?,?$', 'JUNK'),
(r'^than$', 'JUNK'),
(r'^whom$', 'JUNK'),
(r'^Definitions?$', 'JUNK'),
(r'^However,?$', 'JUNK'),
(r'^[Cc]ollectively$', 'JUNK'),
(r'^following$', 'FOLLOWING'),
Expand Down Expand Up @@ -1245,7 +1269,8 @@ def build_detection_from_node(
(r'^[a-z]{3,10}[A-Z][a-z]{3,10}$', 'JUNK'),

(r'^\$?Guid$', 'JUNK'),
#(r'^Small$', 'NN'),
# there is a Mr Small
# (r'^Small$', 'NN'),
(r'^implementing$', 'JUNK'),
(r'^Unlike$', 'JUNK'),
(r'^using$', 'JUNK'),
Expand Down Expand Up @@ -1326,6 +1351,7 @@ def build_detection_from_node(
(r'^[Ss]tatements?.?$', 'JUNK'),
(r'^issues?.?$', 'JUNK'),
(r'^retain?.?$', 'JUNK'),
(r'^Sun3x$', 'JUNK'),

############################################################################
# Nouns and proper Nouns
Expand All @@ -1336,7 +1362,7 @@ def build_detection_from_node(
(r'^This_file_is_part_of_KDE$', 'NAME'),

# K.K. (a company suffix), needs special handling
(r'^K.K.,?$', 'NAME'),
(r'^K.K.,?$', 'COMP'),

# MIT is problematic
# With a comma, always CAPS (MIT alone is too error prone to be always tagged as CAPS)
Expand Down Expand Up @@ -1417,6 +1443,7 @@ def build_detection_from_node(
(r'^DATED$', 'NN'),
(r'^Delay', 'NN'),
(r'^Derivative', 'NN'),
(r'^Direct$', 'NN'),
(r'^DISCLAIMED', 'NN'),
(r'^Docs?$', 'NN'),
(r'^DOCUMENTATION', 'NN'),
Expand Down Expand Up @@ -1506,10 +1533,13 @@ def build_detection_from_node(
(r'^GPLd?\.?$', 'NN'),
(r'^GPL\'d$', 'NN'),
(r'^Gnome$', 'NN'),
(r'^Port$', 'NN'),
(r'^GnuPG$', 'NN'),
(r'^Government.', 'NNP'),
(r'^OProfile$', 'NNP'),
(r'^Government$', 'COMP'),
# there is a Ms. Grant
(r'^Grant$', 'NNP'),
(r'^Grants?\.?,?$', 'NN'),
(r'^Header', 'NN'),
(r'^HylaFAX$', 'NN'),
Expand Down Expand Up @@ -1546,7 +1576,6 @@ def build_detection_from_node(
(r'^List$', 'NN'),
(r'^Set$', 'NN'),
(r'^Last$', 'NN'),
(r'^LAW', 'NN'),
(r'^Legal$', 'NN'),
(r'^LegalTrademarks$', 'NN'),
(r'^Library$', 'NN'),
Expand Down Expand Up @@ -1699,6 +1728,11 @@ def build_detection_from_node(
(r'^CodeMirror$', 'NN'),
(r'^They$', 'JUNK'),
(r'^Branched$', 'NN'),
(r'^Partial$', 'NN'),
(r'^Fixed$', 'NN'),
(r'^Later$', 'NN'),
(r'^Rear$', 'NN'),
(r'^Left$', 'NN'),

(r'^Improved$', 'NN'),
(r'^Designed$', 'NN'),
Expand Down Expand Up @@ -1767,11 +1801,12 @@ def build_detection_from_node(
(r'^Compression$', 'NN'),
(r'^Letter$', 'NN'),
(r'^Moved$', 'NN'),
(r'^More$', 'NN'),
(r'^Phone$', 'NN'),
(r'^[Tt]ests?$', 'JUNK'),

(r'^Inputs?$', 'NN'),


# dual caps that are not NNP
(r'^Make[A-Z]', 'JUNK'),
(r'^Create[A-Z]', 'JUNK'),
Expand Down Expand Up @@ -1959,12 +1994,11 @@ def build_detection_from_node(
(r'^(S\.?A\.?S?|Sas|sas|A\/S|AG,?|AB|Labs?|[Cc][Oo]|Research|Center|INRIA|Societe|KG)[,\.]?$', 'COMP'),
# French SARL
(r'^(SARL|S\.A\.R\.L\.)[\.,\)]*$', 'COMP'),
# More company suffix : a.s. in Czechia and otehrs
# More company suffix : a.s. in Czechia and others
(r'^(a\.s\.|S\.r\.l\.?)$', 'COMP'),
(r'^Vertriebsges\.m\.b\.H\.?,?$', 'COMP'),
# Iceland
(r'^(ehf|hf|svf|ohf)\.,?$', 'COMP'),

# More company abbreviations
(r'^(SPRL|srl)[\.,]?$', 'COMP'),
# Poland
Expand Down Expand Up @@ -2231,6 +2265,7 @@ def build_detection_from_node(
(r'^Meridian\'93$', 'NNP'),
(r'^Xiph.Org$', 'NNP'),
(r'^iClick,?$', 'NNP'),
(r'^electronics?$', 'NNP'),

# proper nouns with digits
(r'^([A-Z][a-z0-9]+){1,2}[\.,]?$', 'NNP'),
Expand Down Expand Up @@ -2258,6 +2293,9 @@ def build_detection_from_node(
(r'^AT$', '<at>'),
(r'^DOT$', 'DOT'),

# exceptions to CAPS
(r'^MMC$', 'JUNK'),

# all CAPS word, at least 1 char long such as MIT, including an optional trailing comma
(r'^[A-Z0-9]+,?$', 'CAPS'),

Expand Down Expand Up @@ -2327,10 +2365,9 @@ def build_detection_from_node(
(r'__MyCompanyName__[\.,]?$', 'NAME'),

# email in brackets <brett_AT_jdom_DOT_org>
#(karl AT indy.rr.com)
#<fdlibm-comments AT sun.com>
# (karl AT indy.rr.com)
# <fdlibm-comments AT sun.com>
(r'(?i:^[<\(][\w\.\-\+]+at[\w\.\-\+]+(dot)?[\w\.\-\+]+[/)>]$)', 'EMAIL'),


# Code variable names including snake case
(r'^.*(_.*)+$', 'JUNK'),
Expand Down Expand Up @@ -2366,7 +2403,6 @@ def build_detection_from_node(
(r'^(?:=>|->|<-|<=)$', 'JUNK'),

(r'^semiconductors?[\.,]?$', 'NNP'),


############################################################################
# catch all other as Nouns
Expand Down Expand Up @@ -2644,6 +2680,7 @@ def build_detection_from_node(
NAME-YEAR: {<YR-RANGE> <NAME-EMAIL|COMPANY>+ <CC> <YR-RANGE>} #540
NAME: {<NAME|NAME-EMAIL>+ <OF> <NNP> <OF> <NN>? <COMPANY>} #550
NAME: {<NAME|NAME-EMAIL>+ <CC|OF>? <NAME|NAME-EMAIL|COMPANY>} #560
NAME: {<NNP><NNP>} #561
Expand Down Expand Up @@ -2677,8 +2714,13 @@ def build_detection_from_node(
#also accept trailing email and URLs
# and "VAN" e.g. Du: Copyright (c) 2008 Alek Du <[email protected]>
NAME-YEAR: {<NAME-YEAR> <VAN>? <EMAIL>?<URL>?} #5701
# Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
NAME-YEAR: {<NAME-YEAR> <NN> <DASH> <NAME>} # 5701.1
NAME-YEAR: {<NAME-YEAR>+} #5702
NAME: {<NNP> <OF> <NNP>} #580
NAME: {<NAME> <NNP>} #590
NAME: {<NN|NNP|CAPS>+ <CC> <OTH>} #600
Expand Down Expand Up @@ -2898,6 +2940,10 @@ def build_detection_from_node(
# Copyright (c) 2013-2015 Streams Standard Reference Implementation Authors
COPYRIGHT: {<COPY>+ <NAME-YEAR> <NN|NNP>+ <AUTHS>} #1566
# Nicolas Pitre, (c) 2002 Monta Vista Software Inc
# Cliff Brake, (c) 2001
#COPYRIGHT: {<NAME> <COPY> <NAME-YEAR> <NAME> <COPY> <YR-RANGE>} #1566.1
# copyright: Copyright (c) Joe Joyce and contributors, 2016-2019.
COPYRIGHT: {<COPY>+ <NAME> <CC> <NN> <YR-RANGE>} #1579992
Expand Down Expand Up @@ -3082,8 +3128,11 @@ def build_detection_from_node(
# Author: Jeff LaBundy <[email protected]>
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <AUTH> <NAME-EMAIL>} #2280-3
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280
COPYRIGHT: {<COPYRIGHT2> <BY> <NAME-YEAR|NAME-EMAIL> <BY>? <NAME-YEAR|NAME-EMAIL>? } #2280-4
# using #2280 above: Copyright 2018 Developers of the Rand project
COPYRIGHT: {<COPYRIGHT2> <MAINT> <OF> <COMPANY>} #2280.123
Expand Down Expand Up @@ -3206,7 +3255,8 @@ def build_detection_from_node(
COPYRIGHT: {<COPYRIGHT2> <CAPS> <CD|CDS> <COMPANY> <NAME>} #2009.1
# COPYRIGHT (c) 2006 - 2009 DIONYSOS
COPYRIGHT: {<COPYRIGHT2> <CAPS>} #2009
# Copyright 2003 ICT CAS
COPYRIGHT: {<COPYRIGHT2> <CAPS>+} #2009
# Copyright (C) 2000 See Beyond Communications Corporation
COPYRIGHT2: {<COPYRIGHT2> <JUNK> <COMPANY>} # 2010
Expand Down Expand Up @@ -3404,7 +3454,7 @@ def build_detection_from_node(
#Copyright (C) 2012-2016 by the following authors:
#- Wladimir J. van der Laan <[email protected]>
NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13
NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13
NAME-EMAIL: {<DASH> <NAME-EMAIL> <NN>?} #157999.14
COPYRIGHT: {<COPYRIGHT2> <FOLLOWING> <AUTHS> <NAME-EMAIL>+ } #157999.14
Expand Down Expand Up @@ -3943,6 +3993,8 @@ def is_junk_copyright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS):
'a',
'</p>',
'or',
'taken',
'from',
])

# these final holders are ignored.
Expand Down Expand Up @@ -4453,7 +4505,7 @@ def remove_code_comment_markers(s):
Return ``s`` removing code comments such as C and C++ style comment markers and assimilated
>>> remove_code_comment_markers(r"\\*#%; /\\/*a*/b/*c\\d#e%f \\*#%; /")
'a b c\\\d e f'
'a b c\\\\d e f'
"""
return (s
.replace('/*', ' ')
Expand Down Expand Up @@ -4529,7 +4581,7 @@ def prepare_text_line(line):
.replace('\\XA9', ' (c) ')
.replace('\\A9', ' (c) ')
.replace('\\a9', ' (c) ')
.replace('<A9>', ' (c) ')
.replace('<A9>', ' (c) ')
.replace('XA9;', ' (c) ')
.replace('Xa9;', ' (c) ')
.replace('xA9;', ' (c) ')
Expand Down Expand Up @@ -4580,7 +4632,7 @@ def prepare_text_line(line):
.replace('year>', " ")
.replace('<year>', " ")
.replace('<name>', " ")

)

if TRACE_TOK:
Expand Down
Loading

0 comments on commit 6126c28

Please sign in to comment.