forked from aboutcode-org/scancode-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
aboutcode-org#3659 Fixed copyright detection normalization and Move normalization to copyrights.py and unit tests passed
- Loading branch information
Showing
1,040 changed files
with
103,975 additions
and
405 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,7 +25,6 @@ | |
from pygmars import Token | ||
from pygmars.tree import Tree | ||
|
||
|
||
from cluecode import copyrights_hint | ||
from textcode.markup import strip_known_markup_from_text | ||
|
||
|
@@ -162,8 +161,24 @@ def detect_copyrights( | |
Strip markup from text if ``demarkup`` is True. | ||
Run for up to ``deadline`` seconds and return results found so far. | ||
""" | ||
from cluecode.linux_credits import detect_credits_authors | ||
|
||
from textcode.analysis import numbered_text_lines | ||
|
||
if include_authors: | ||
author_detections = list(detect_credits_authors(location)) | ||
|
||
if TRACE: | ||
logger_debug('detect_copyrights: detect_credits_authors') | ||
for detecta in author_detections: | ||
logger_debug(f' {detecta}') | ||
|
||
# bail out if we have a credits file with credits | ||
if author_detections: | ||
for a in author_detections: | ||
yield a | ||
return | ||
|
||
numbered_lines = list(numbered_text_lines(location, demarkup=True)) | ||
|
||
if TRACE or TRACE_TOK: | ||
|
@@ -716,8 +731,9 @@ def build_detection_from_node( | |
# Slovenian: avtorske pravice | ||
# Ukrainian: авторське право | ||
|
||
# rare typo copyrighy | ||
# rare typos in copyright | ||
(r'^Copyrighy$', 'COPY'), | ||
(r'^Copyirght$', 'COPY'), | ||
|
||
# OSGI | ||
(r'^Bundle-Copyright', 'COPY'), | ||
|
@@ -959,6 +975,7 @@ def build_detection_from_node( | |
(r'^[Ss]tring$', 'JUNK'), | ||
(r'^Implementation-Vendor$', 'JUNK'), | ||
(r'^dnl$', 'JUNK'), | ||
(r'^ifndef$', 'JUNK'), | ||
|
||
(r'^as$', 'NN'), | ||
(r'^[Vv]isit$', 'JUNK'), | ||
|
@@ -994,7 +1011,6 @@ def build_detection_from_node( | |
(r'^Add$', 'JUNK'), | ||
(r'^Average$', 'JUNK'), | ||
(r'^Taken$', 'JUNK'), | ||
(r'^LAWS\.?$', 'JUNK'), | ||
(r'^design$', 'JUNK'), | ||
(r'^Driver$', 'JUNK'), | ||
(r'^[Cc]ontribution\.?', 'JUNK'), | ||
|
@@ -1004,7 +1020,7 @@ def build_detection_from_node( | |
(r'^Last-Translator$', 'JUNK'), | ||
(r'^Translated$', 'JUNK'), | ||
(r'^OMAP730$', 'JUNK'), | ||
(r'^Law\.$', 'JUNK'), | ||
|
||
(r'^dylid$', 'JUNK'), | ||
(r'^BeOS$', 'JUNK'), | ||
(r'^Generates?$', 'JUNK'), | ||
|
@@ -1046,7 +1062,6 @@ def build_detection_from_node( | |
(r'^Disclaimer$', 'JUNK'), | ||
(r'^Directive.?$', 'JUNK'), | ||
(r'^LAWS\,?$', 'JUNK'), | ||
(r'^[Ll]aws?,?$', 'JUNK'), | ||
(r'^me$', 'JUNK'), | ||
(r'^Derived$', 'JUNK'), | ||
(r'^Limitations?$', 'JUNK'), | ||
|
@@ -1117,7 +1132,15 @@ def build_detection_from_node( | |
(r'^Much$', 'JUNK'), | ||
(r'^remains?,?$', 'JUNK'), | ||
(r'^earlier$', 'JUNK'), | ||
(r'^[lL]aws?$', 'JUNK'), | ||
|
||
# there is a Mr. Law | ||
(r'^Law[\.,]?$', 'NN'), | ||
(r'^laws?[\.,]?$', 'JUNK'), | ||
(r'^Laws[\.,]?$', 'JUNK'), | ||
(r'^LAWS?[\.,]?$', 'JUNK'), | ||
(r'^LAWS?$', 'NN'), | ||
|
||
(r'^taken$', 'NN'), | ||
(r'^Insert$', 'JUNK'), | ||
(r'^url$', 'JUNK'), | ||
(r'^[Ss]ee$', 'JUNK'), | ||
|
@@ -1138,6 +1161,7 @@ def build_detection_from_node( | |
(r'^[Ii]nterfaces?,?$', 'JUNK'), | ||
(r'^than$', 'JUNK'), | ||
(r'^whom$', 'JUNK'), | ||
(r'^Definitions?$', 'JUNK'), | ||
(r'^However,?$', 'JUNK'), | ||
(r'^[Cc]ollectively$', 'JUNK'), | ||
(r'^following$', 'FOLLOWING'), | ||
|
@@ -1245,7 +1269,8 @@ def build_detection_from_node( | |
(r'^[a-z]{3,10}[A-Z][a-z]{3,10}$', 'JUNK'), | ||
|
||
(r'^\$?Guid$', 'JUNK'), | ||
#(r'^Small$', 'NN'), | ||
# there is a Mr Small | ||
# (r'^Small$', 'NN'), | ||
(r'^implementing$', 'JUNK'), | ||
(r'^Unlike$', 'JUNK'), | ||
(r'^using$', 'JUNK'), | ||
|
@@ -1326,6 +1351,7 @@ def build_detection_from_node( | |
(r'^[Ss]tatements?.?$', 'JUNK'), | ||
(r'^issues?.?$', 'JUNK'), | ||
(r'^retain?.?$', 'JUNK'), | ||
(r'^Sun3x$', 'JUNK'), | ||
|
||
############################################################################ | ||
# Nouns and proper Nouns | ||
|
@@ -1336,7 +1362,7 @@ def build_detection_from_node( | |
(r'^This_file_is_part_of_KDE$', 'NAME'), | ||
|
||
# K.K. (a company suffix), needs special handling | ||
(r'^K.K.,?$', 'NAME'), | ||
(r'^K.K.,?$', 'COMP'), | ||
|
||
# MIT is problematic | ||
# With a comma, always CAPS (MIT alone is too error prone to be always tagged as CAPS | ||
|
@@ -1417,6 +1443,7 @@ def build_detection_from_node( | |
(r'^DATED$', 'NN'), | ||
(r'^Delay', 'NN'), | ||
(r'^Derivative', 'NN'), | ||
(r'^Direct$', 'NN'), | ||
(r'^DISCLAIMED', 'NN'), | ||
(r'^Docs?$', 'NN'), | ||
(r'^DOCUMENTATION', 'NN'), | ||
|
@@ -1506,10 +1533,13 @@ def build_detection_from_node( | |
(r'^GPLd?\.?$', 'NN'), | ||
(r'^GPL\'d$', 'NN'), | ||
(r'^Gnome$', 'NN'), | ||
(r'^Port$', 'NN'), | ||
(r'^GnuPG$', 'NN'), | ||
(r'^Government.', 'NNP'), | ||
(r'^OProfile$', 'NNP'), | ||
(r'^Government$', 'COMP'), | ||
# there is a Ms. Grant | ||
(r'^Grant$', 'NNP'), | ||
(r'^Grants?\.?,?$', 'NN'), | ||
(r'^Header', 'NN'), | ||
(r'^HylaFAX$', 'NN'), | ||
|
@@ -1546,7 +1576,6 @@ def build_detection_from_node( | |
(r'^List$', 'NN'), | ||
(r'^Set$', 'NN'), | ||
(r'^Last$', 'NN'), | ||
(r'^LAW', 'NN'), | ||
(r'^Legal$', 'NN'), | ||
(r'^LegalTrademarks$', 'NN'), | ||
(r'^Library$', 'NN'), | ||
|
@@ -1699,6 +1728,11 @@ def build_detection_from_node( | |
(r'^CodeMirror$', 'NN'), | ||
(r'^They$', 'JUNK'), | ||
(r'^Branched$', 'NN'), | ||
(r'^Partial$', 'NN'), | ||
(r'^Fixed$', 'NN'), | ||
(r'^Later$', 'NN'), | ||
(r'^Rear$', 'NN'), | ||
(r'^Left$', 'NN'), | ||
|
||
(r'^Improved$', 'NN'), | ||
(r'^Designed$', 'NN'), | ||
|
@@ -1767,11 +1801,12 @@ def build_detection_from_node( | |
(r'^Compression$', 'NN'), | ||
(r'^Letter$', 'NN'), | ||
(r'^Moved$', 'NN'), | ||
(r'^More$', 'NN'), | ||
(r'^Phone$', 'NN'), | ||
(r'^[Tt]ests?$', 'JUNK'), | ||
|
||
(r'^Inputs?$', 'NN'), | ||
|
||
|
||
# dual caps that are not NNP | ||
(r'^Make[A-Z]', 'JUNK'), | ||
(r'^Create[A-Z]', 'JUNK'), | ||
|
@@ -1959,12 +1994,11 @@ def build_detection_from_node( | |
(r'^(S\.?A\.?S?|Sas|sas|A\/S|AG,?|AB|Labs?|[Cc][Oo]|Research|Center|INRIA|Societe|KG)[,\.]?$', 'COMP'), | ||
# French SARL | ||
(r'^(SARL|S\.A\.R\.L\.)[\.,\)]*$', 'COMP'), | ||
# More company suffix : a.s. in Czechia and otehrs | ||
# More company suffix : a.s. in Czechia and others | ||
(r'^(a\.s\.|S\.r\.l\.?)$', 'COMP'), | ||
(r'^Vertriebsges\.m\.b\.H\.?,?$', 'COMP'), | ||
# Iceland | ||
(r'^(ehf|hf|svf|ohf)\.,?$', 'COMP'), | ||
|
||
# More company abbreviations | ||
(r'^(SPRL|srl)[\.,]?$', 'COMP'), | ||
# Poland | ||
|
@@ -2231,6 +2265,7 @@ def build_detection_from_node( | |
(r'^Meridian\'93$', 'NNP'), | ||
(r'^Xiph.Org$', 'NNP'), | ||
(r'^iClick,?$', 'NNP'), | ||
(r'^electronics?$', 'NNP'), | ||
|
||
# proper nouns with digits | ||
(r'^([A-Z][a-z0-9]+){1,2}[\.,]?$', 'NNP'), | ||
|
@@ -2258,6 +2293,9 @@ def build_detection_from_node( | |
(r'^AT$', '<at>'), | ||
(r'^DOT$', 'DOT'), | ||
|
||
# exceptions to CAPS | ||
(r'^MMC$', 'JUNK'), | ||
|
||
# all CAPS word, at least 1 char long such as MIT, including an optional trailing comma or dot | ||
(r'^[A-Z0-9]+,?$', 'CAPS'), | ||
|
||
|
@@ -2327,10 +2365,9 @@ def build_detection_from_node( | |
(r'__MyCompanyName__[\.,]?$', 'NAME'), | ||
|
||
# email in brackets <brett_AT_jdom_DOT_org> | ||
#(karl AT indy.rr.com) | ||
#<fdlibm-comments AT sun.com> | ||
# (karl AT indy.rr.com) | ||
# <fdlibm-comments AT sun.com> | ||
(r'(?i:^[<\(][\w\.\-\+]+at[\w\.\-\+]+(dot)?[\w\.\-\+]+[/)>]$)', 'EMAIL'), | ||
|
||
|
||
# Code variable names including snake case | ||
(r'^.*(_.*)+$', 'JUNK'), | ||
|
@@ -2366,7 +2403,6 @@ def build_detection_from_node( | |
(r'^(?:=>|->|<-|<=)$', 'JUNK'), | ||
|
||
(r'^semiconductors?[\.,]?$', 'NNP'), | ||
|
||
|
||
############################################################################ | ||
# catch all other as Nouns | ||
|
@@ -2644,6 +2680,7 @@ def build_detection_from_node( | |
NAME-YEAR: {<YR-RANGE> <NAME-EMAIL|COMPANY>+ <CC> <YR-RANGE>} #540 | ||
NAME: {<NAME|NAME-EMAIL>+ <OF> <NNP> <OF> <NN>? <COMPANY>} #550 | ||
NAME: {<NAME|NAME-EMAIL>+ <CC|OF>? <NAME|NAME-EMAIL|COMPANY>} #560 | ||
NAME: {<NNP><NNP>} #561 | ||
|
@@ -2677,8 +2714,13 @@ def build_detection_from_node( | |
#also accept trailing email and URLs | ||
# and "VAN" e.g. Du: Copyright (c) 2008 Alek Du <[email protected]> | ||
NAME-YEAR: {<NAME-YEAR> <VAN>? <EMAIL>?<URL>?} #5701 | ||
# Copyright (C) 2008 Jim Law - Iris LP All rights reserved. | ||
NAME-YEAR: {<NAME-YEAR> <NN> <DASH> <NAME>} # 5701.1 | ||
NAME-YEAR: {<NAME-YEAR>+} #5702 | ||
NAME: {<NNP> <OF> <NNP>} #580 | ||
NAME: {<NAME> <NNP>} #590 | ||
NAME: {<NN|NNP|CAPS>+ <CC> <OTH>} #600 | ||
|
@@ -2898,6 +2940,10 @@ def build_detection_from_node( | |
# Copyright (c) 2013-2015 Streams Standard Reference Implementation Authors | ||
COPYRIGHT: {<COPY>+ <NAME-YEAR> <NN|NNP>+ <AUTHS>} #1566 | ||
# Nicolas Pitre, (c) 2002 Monta Vista Software Inc | ||
# Cliff Brake, (c) 2001 | ||
#COPYRIGHT: {<NAME> <COPY> <NAME-YEAR> <NAME> <COPY> <YR-RANGE>} #1566.1 | ||
# copyright: Copyright (c) Joe Joyce and contributors, 2016-2019. | ||
COPYRIGHT: {<COPY>+ <NAME> <CC> <NN> <YR-RANGE>} #1579992 | ||
|
@@ -3082,8 +3128,11 @@ def build_detection_from_node( | |
# Author: Jeff LaBundy <[email protected]> | ||
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <AUTH> <NAME-EMAIL>} #2280-3 | ||
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280 | ||
COPYRIGHT: {<COPYRIGHT2> <BY> <NAME-YEAR|NAME-EMAIL> <BY>? <NAME-YEAR|NAME-EMAIL>? } #2280-4 | ||
# using #2280 above: Copyright 2018 Developers of the Rand project | ||
COPYRIGHT: {<COPYRIGHT2> <MAINT> <OF> <COMPANY>} #2280.123 | ||
|
@@ -3206,7 +3255,8 @@ def build_detection_from_node( | |
COPYRIGHT: {<COPYRIGHT2> <CAPS> <CD|CDS> <COMPANY> <NAME>} #2009.1 | ||
# COPYRIGHT (c) 2006 - 2009 DIONYSOS | ||
COPYRIGHT: {<COPYRIGHT2> <CAPS>} #2009 | ||
# Copyright 2003 ICT CAS | ||
COPYRIGHT: {<COPYRIGHT2> <CAPS>+} #2009 | ||
# Copyright (C) 2000 See Beyond Communications Corporation | ||
COPYRIGHT2: {<COPYRIGHT2> <JUNK> <COMPANY>} # 2010 | ||
|
@@ -3404,7 +3454,7 @@ def build_detection_from_node( | |
#Copyright (C) 2012-2016 by the following authors: | ||
#- Wladimir J. van der Laan <[email protected]> | ||
NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13 | ||
NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13 | ||
NAME-EMAIL: {<DASH> <NAME-EMAIL> <NN>?} #157999.14 | ||
COPYRIGHT: {<COPYRIGHT2> <FOLLOWING> <AUTHS> <NAME-EMAIL>+ } #157999.14 | ||
|
@@ -3943,6 +3993,8 @@ def is_junk_copyright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS): | |
'a', | ||
'</p>', | ||
'or', | ||
'taken', | ||
'from', | ||
]) | ||
|
||
# these final holders are ignored. | ||
|
@@ -4453,7 +4505,7 @@ def remove_code_comment_markers(s): | |
Return ``s`` removing code comments such as C and C++ style comment markers and assimilated | ||
>>> remove_code_comment_markers(r"\\*#%; /\\/*a*/b/*c\\d#e%f \\*#%; /") | ||
'a b c\\\d e f' | ||
'a b c\\\\d e f' | ||
""" | ||
return (s | ||
.replace('/*', ' ') | ||
|
@@ -4529,7 +4581,7 @@ def prepare_text_line(line): | |
.replace('\\XA9', ' (c) ') | ||
.replace('\\A9', ' (c) ') | ||
.replace('\\a9', ' (c) ') | ||
.replace('<A9>', ' (c) ') | ||
.replace('<A9>', ' (c) ') | ||
.replace('XA9;', ' (c) ') | ||
.replace('Xa9;', ' (c) ') | ||
.replace('xA9;', ' (c) ') | ||
|
@@ -4580,7 +4632,7 @@ def prepare_text_line(line): | |
.replace('year>', " ") | ||
.replace('<year>', " ") | ||
.replace('<name>', " ") | ||
|
||
) | ||
|
||
if TRACE_TOK: | ||
|
Oops, something went wrong.