Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Copyright Detection #3929

Merged
merged 6 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
94 changes: 73 additions & 21 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pygmars import Token
from pygmars.tree import Tree


from cluecode import copyrights_hint
from textcode.markup import strip_known_markup_from_text

Expand Down Expand Up @@ -107,8 +106,24 @@ def detect_copyrights(
Strip markup from text if ``demarkup`` is True.
Run for up to ``deadline`` seconds and return results found so far.
"""
from cluecode.linux_credits import detect_credits_authors

from textcode.analysis import numbered_text_lines

if include_authors:
author_detections = list(detect_credits_authors(location))

if TRACE:
logger_debug('detect_copyrights: detect_credits_authors')
for detecta in author_detections:
logger_debug(f' {detecta}')

# bail out if we have a credits file with credits
if author_detections:
for a in author_detections:
yield a
return

numbered_lines = list(numbered_text_lines(location, demarkup=True))

if TRACE or TRACE_TOK:
Expand Down Expand Up @@ -661,8 +676,9 @@ def build_detection_from_node(
# Slovenian: avtorske pravice
# Ukrainian: авторське право

# rare typo copyrighy
# rare typos in copyright
(r'^Copyrighy$', 'COPY'),
(r'^Copyirght$', 'COPY'),

# OSGI
(r'^Bundle-Copyright', 'COPY'),
Expand Down Expand Up @@ -904,6 +920,7 @@ def build_detection_from_node(
(r'^[Ss]tring$', 'JUNK'),
(r'^Implementation-Vendor$', 'JUNK'),
(r'^dnl$', 'JUNK'),
(r'^ifndef$', 'JUNK'),

(r'^as$', 'NN'),
(r'^[Vv]isit$', 'JUNK'),
Expand Down Expand Up @@ -939,7 +956,6 @@ def build_detection_from_node(
(r'^Add$', 'JUNK'),
(r'^Average$', 'JUNK'),
(r'^Taken$', 'JUNK'),
(r'^LAWS\.?$', 'JUNK'),
(r'^design$', 'JUNK'),
(r'^Driver$', 'JUNK'),
(r'^[Cc]ontribution\.?', 'JUNK'),
Expand All @@ -949,7 +965,7 @@ def build_detection_from_node(
(r'^Last-Translator$', 'JUNK'),
(r'^Translated$', 'JUNK'),
(r'^OMAP730$', 'JUNK'),
(r'^Law\.$', 'JUNK'),

(r'^dylid$', 'JUNK'),
(r'^BeOS$', 'JUNK'),
(r'^Generates?$', 'JUNK'),
Expand Down Expand Up @@ -991,7 +1007,6 @@ def build_detection_from_node(
(r'^Disclaimer$', 'JUNK'),
(r'^Directive.?$', 'JUNK'),
(r'^LAWS\,?$', 'JUNK'),
(r'^[Ll]aws?,?$', 'JUNK'),
(r'^me$', 'JUNK'),
(r'^Derived$', 'JUNK'),
(r'^Limitations?$', 'JUNK'),
Expand Down Expand Up @@ -1062,7 +1077,15 @@ def build_detection_from_node(
(r'^Much$', 'JUNK'),
(r'^remains?,?$', 'JUNK'),
(r'^earlier$', 'JUNK'),
(r'^[lL]aws?$', 'JUNK'),

# there is a Mr. Law
(r'^Law[\.,]?$', 'NN'),
(r'^laws?[\.,]?$', 'JUNK'),
(r'^Laws[\.,]?$', 'JUNK'),
(r'^LAWS?[\.,]?$', 'JUNK'),
(r'^LAWS?$', 'NN'),

(r'^taken$', 'NN'),
(r'^Insert$', 'JUNK'),
(r'^url$', 'JUNK'),
(r'^[Ss]ee$', 'JUNK'),
Expand All @@ -1083,6 +1106,7 @@ def build_detection_from_node(
(r'^[Ii]nterfaces?,?$', 'JUNK'),
(r'^than$', 'JUNK'),
(r'^whom$', 'JUNK'),
(r'^Definitions?$', 'JUNK'),
(r'^However,?$', 'JUNK'),
(r'^[Cc]ollectively$', 'JUNK'),
(r'^following$', 'FOLLOWING'),
Expand Down Expand Up @@ -1190,7 +1214,8 @@ def build_detection_from_node(
(r'^[a-z]{3,10}[A-Z][a-z]{3,10}$', 'JUNK'),

(r'^\$?Guid$', 'JUNK'),
#(r'^Small$', 'NN'),
# there is a Mr Small
# (r'^Small$', 'NN'),
(r'^implementing$', 'JUNK'),
(r'^Unlike$', 'JUNK'),
(r'^using$', 'JUNK'),
Expand Down Expand Up @@ -1271,6 +1296,7 @@ def build_detection_from_node(
(r'^[Ss]tatements?.?$', 'JUNK'),
(r'^issues?.?$', 'JUNK'),
(r'^retain?.?$', 'JUNK'),
(r'^Sun3x$', 'JUNK'),

############################################################################
# Nouns and proper Nouns
Expand All @@ -1281,7 +1307,7 @@ def build_detection_from_node(
(r'^This_file_is_part_of_KDE$', 'NAME'),

# K.K. (a company suffix), needs special handling
(r'^K.K.,?$', 'NAME'),
(r'^K.K.,?$', 'COMP'),

# MIT is problematic
# With a comma, always CAPS (MIT alone is too error prone to be always tagged as CAPS
Expand Down Expand Up @@ -1362,6 +1388,7 @@ def build_detection_from_node(
(r'^DATED$', 'NN'),
(r'^Delay', 'NN'),
(r'^Derivative', 'NN'),
(r'^Direct$', 'NN'),
(r'^DISCLAIMED', 'NN'),
(r'^Docs?$', 'NN'),
(r'^DOCUMENTATION', 'NN'),
Expand Down Expand Up @@ -1451,10 +1478,13 @@ def build_detection_from_node(
(r'^GPLd?\.?$', 'NN'),
(r'^GPL\'d$', 'NN'),
(r'^Gnome$', 'NN'),
(r'^Port$', 'NN'),
(r'^GnuPG$', 'NN'),
(r'^Government.', 'NNP'),
(r'^OProfile$', 'NNP'),
(r'^Government$', 'COMP'),
# there is a Ms. Grant
(r'^Grant$', 'NNP'),
(r'^Grants?\.?,?$', 'NN'),
(r'^Header', 'NN'),
(r'^HylaFAX$', 'NN'),
Expand Down Expand Up @@ -1491,7 +1521,6 @@ def build_detection_from_node(
(r'^List$', 'NN'),
(r'^Set$', 'NN'),
(r'^Last$', 'NN'),
(r'^LAW', 'NN'),
(r'^Legal$', 'NN'),
(r'^LegalTrademarks$', 'NN'),
(r'^Library$', 'NN'),
Expand Down Expand Up @@ -1644,6 +1673,11 @@ def build_detection_from_node(
(r'^CodeMirror$', 'NN'),
(r'^They$', 'JUNK'),
(r'^Branched$', 'NN'),
(r'^Partial$', 'NN'),
(r'^Fixed$', 'NN'),
(r'^Later$', 'NN'),
(r'^Rear$', 'NN'),
(r'^Left$', 'NN'),

(r'^Improved$', 'NN'),
(r'^Designed$', 'NN'),
Expand Down Expand Up @@ -1712,11 +1746,12 @@ def build_detection_from_node(
(r'^Compression$', 'NN'),
(r'^Letter$', 'NN'),
(r'^Moved$', 'NN'),
(r'^More$', 'NN'),
(r'^Phone$', 'NN'),
(r'^[Tt]ests?$', 'JUNK'),

(r'^Inputs?$', 'NN'),


# dual caps that are not NNP
(r'^Make[A-Z]', 'JUNK'),
(r'^Create[A-Z]', 'JUNK'),
Expand Down Expand Up @@ -1904,12 +1939,11 @@ def build_detection_from_node(
(r'^(S\.?A\.?S?|Sas|sas|A\/S|AG,?|AB|Labs?|[Cc][Oo]|Research|Center|INRIA|Societe|KG)[,\.]?$', 'COMP'),
# French SARL
(r'^(SARL|S\.A\.R\.L\.)[\.,\)]*$', 'COMP'),
# More company suffix : a.s. in Czechia and otehrs
# More company suffix : a.s. in Czechia and others
(r'^(a\.s\.|S\.r\.l\.?)$', 'COMP'),
(r'^Vertriebsges\.m\.b\.H\.?,?$', 'COMP'),
# Iceland
(r'^(ehf|hf|svf|ohf)\.,?$', 'COMP'),

# More company abbreviations
(r'^(SPRL|srl)[\.,]?$', 'COMP'),
# Poland
Expand Down Expand Up @@ -2176,6 +2210,7 @@ def build_detection_from_node(
(r'^Meridian\'93$', 'NNP'),
(r'^Xiph.Org$', 'NNP'),
(r'^iClick,?$', 'NNP'),
(r'^electronics?$', 'NNP'),

# proper nouns with digits
(r'^([A-Z][a-z0-9]+){1,2}[\.,]?$', 'NNP'),
Expand Down Expand Up @@ -2203,6 +2238,9 @@ def build_detection_from_node(
(r'^AT$', '<at>'),
(r'^DOT$', 'DOT'),

# exceptions to CAPS
(r'^MMC$', 'JUNK'),

# all CAPS word, at least 1 char long such as MIT, including an optional trailing comma or dot
(r'^[A-Z0-9]+,?$', 'CAPS'),

Expand Down Expand Up @@ -2272,10 +2310,9 @@ def build_detection_from_node(
(r'__MyCompanyName__[\.,]?$', 'NAME'),

# email in brackets <brett_AT_jdom_DOT_org>
#(karl AT indy.rr.com)
#<fdlibm-comments AT sun.com>
# (karl AT indy.rr.com)
# <fdlibm-comments AT sun.com>
(r'(?i:^[<\(][\w\.\-\+]+at[\w\.\-\+]+(dot)?[\w\.\-\+]+[/)>]$)', 'EMAIL'),


# Code variable names including snake case
(r'^.*(_.*)+$', 'JUNK'),
Expand Down Expand Up @@ -2311,7 +2348,6 @@ def build_detection_from_node(
(r'^(?:=>|->|<-|<=)$', 'JUNK'),

(r'^semiconductors?[\.,]?$', 'NNP'),


############################################################################
# catch all other as Nouns
Expand Down Expand Up @@ -2589,6 +2625,7 @@ def build_detection_from_node(
NAME-YEAR: {<YR-RANGE> <NAME-EMAIL|COMPANY>+ <CC> <YR-RANGE>} #540

NAME: {<NAME|NAME-EMAIL>+ <OF> <NNP> <OF> <NN>? <COMPANY>} #550

NAME: {<NAME|NAME-EMAIL>+ <CC|OF>? <NAME|NAME-EMAIL|COMPANY>} #560

NAME: {<NNP><NNP>} #561
Expand Down Expand Up @@ -2622,8 +2659,13 @@ def build_detection_from_node(
#also accept trailing email and URLs
# and "VAN" e.g. Du: Copyright (c) 2008 Alek Du <[email protected]>
NAME-YEAR: {<NAME-YEAR> <VAN>? <EMAIL>?<URL>?} #5701

# Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
NAME-YEAR: {<NAME-YEAR> <NN> <DASH> <NAME>} # 5701.1

NAME-YEAR: {<NAME-YEAR>+} #5702


NAME: {<NNP> <OF> <NNP>} #580
NAME: {<NAME> <NNP>} #590
NAME: {<NN|NNP|CAPS>+ <CC> <OTH>} #600
Expand Down Expand Up @@ -2843,6 +2885,10 @@ def build_detection_from_node(
# Copyright (c) 2013-2015 Streams Standard Reference Implementation Authors
COPYRIGHT: {<COPY>+ <NAME-YEAR> <NN|NNP>+ <AUTHS>} #1566

# Nicolas Pitre, (c) 2002 Monta Vista Software Inc
# Cliff Brake, (c) 2001
#COPYRIGHT: {<NAME> <COPY> <NAME-YEAR> <NAME> <COPY> <YR-RANGE>} #1566.1

# copyright: Copyright (c) Joe Joyce and contributors, 2016-2019.
COPYRIGHT: {<COPY>+ <NAME> <CC> <NN> <YR-RANGE>} #1579992

Expand Down Expand Up @@ -3027,8 +3073,11 @@ def build_detection_from_node(
# Author: Jeff LaBundy <[email protected]>
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <AUTH> <NAME-EMAIL>} #2280-3


COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280

COPYRIGHT: {<COPYRIGHT2> <BY> <NAME-YEAR|NAME-EMAIL> <BY>? <NAME-YEAR|NAME-EMAIL>? } #2280-4

# using #2280 above: Copyright 2018 Developers of the Rand project
COPYRIGHT: {<COPYRIGHT2> <MAINT> <OF> <COMPANY>} #2280.123

Expand Down Expand Up @@ -3151,7 +3200,8 @@ def build_detection_from_node(
COPYRIGHT: {<COPYRIGHT2> <CAPS> <CD|CDS> <COMPANY> <NAME>} #2009.1

# COPYRIGHT (c) 2006 - 2009 DIONYSOS
COPYRIGHT: {<COPYRIGHT2> <CAPS>} #2009
# Copyright 2003 ICT CAS
COPYRIGHT: {<COPYRIGHT2> <CAPS>+} #2009

# Copyright (C) 2000 See Beyond Communications Corporation
COPYRIGHT2: {<COPYRIGHT2> <JUNK> <COMPANY>} # 2010
Expand Down Expand Up @@ -3349,7 +3399,7 @@ def build_detection_from_node(
#Copyright (C) 2012-2016 by the following authors:
#- Wladimir J. van der Laan <[email protected]>

NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13
NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13
NAME-EMAIL: {<DASH> <NAME-EMAIL> <NN>?} #157999.14
COPYRIGHT: {<COPYRIGHT2> <FOLLOWING> <AUTHS> <NAME-EMAIL>+ } #157999.14

Expand Down Expand Up @@ -3888,6 +3938,8 @@ def is_junk_copyright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS):
'a',
'</p>',
'or',
'taken',
'from',
])

# these final holders are ignored.
Expand Down Expand Up @@ -4398,7 +4450,7 @@ def remove_code_comment_markers(s):
Return ``s`` removing code comments such as C and C++ style comment markers and assimilated

>>> remove_code_comment_markers(r"\\*#%; /\\/*a*/b/*c\\d#e%f \\*#%; /")
'a b c\\\d e f'
'a b c\\\\d e f'
"""
return (s
.replace('/*', ' ')
Expand Down Expand Up @@ -4474,7 +4526,7 @@ def prepare_text_line(line):
.replace('\\XA9', ' (c) ')
.replace('\\A9', ' (c) ')
.replace('\\a9', ' (c) ')
.replace('<A9>', ' (c) ')
.replace('<A9>', ' (c) ')
.replace('XA9;', ' (c) ')
.replace('Xa9;', ' (c) ')
.replace('xA9;', ' (c) ')
Expand Down Expand Up @@ -4525,7 +4577,7 @@ def prepare_text_line(line):
.replace('year>', " ")
.replace('<year>', " ")
.replace('<name>', " ")

)

if TRACE_TOK:
Expand Down
Loading
Loading