Skip to content

Commit

Permalink
Merge pull request #3917 from aboutcode-org/misc-copyrights2
Browse files Browse the repository at this point in the history
Improve copyrights detection more
  • Loading branch information
pombredanne authored Sep 12, 2024
2 parents 6e756c4 + 645ac27 commit 498467c
Show file tree
Hide file tree
Showing 46 changed files with 533 additions and 38 deletions.
27 changes: 27 additions & 0 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,33 @@ jobs:
tests/licensedcode/test_detection_validate.py \
-k TestValidateLicenseExtended5
license_validate_ignorables_1: |
venv/bin/pytest -n 3 -vvs --test-suite=validate \
tests/licensedcode/test_detection_validate.py \
-k TestValidateLicenseIgnorableClues1
license_validate_ignorables_2: |
venv/bin/pytest -n 3 -vvs --test-suite=validate \
tests/licensedcode/test_detection_validate.py \
-k TestValidateLicenseIgnorableClues2
license_validate_ignorables_3: |
venv/bin/pytest -n 3 -vvs --test-suite=validate \
tests/licensedcode/test_detection_validate.py \
-k TestValidateLicenseIgnorableClues3
license_validate_ignorables_4: |
venv/bin/pytest -n 3 -vvs --test-suite=validate \
tests/licensedcode/test_detection_validate.py \
-k TestValidateLicenseIgnorableClues4
license_validate_ignorables_5: |
venv/bin/pytest -n 3 -vvs --test-suite=validate \
tests/licensedcode/test_detection_validate.py \
-k TestValidateLicenseIgnorableClues5
license_cache: |
venv/bin/pytest -n 3 -vvs --test-suite=all \
tests/licensedcode/test_zzzz_cache.py --reruns 2
Expand Down
89 changes: 72 additions & 17 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,12 @@ def build_detection_from_node(
# verbatim star
(r'^\*$', 'JUNK'),

# misc company names exception to next rule
(r'^TinCanTools$', 'NNP'),
(r'^SoftwareBitMaker$', 'NNP'),
(r'^NetCommWireless$', 'NNP'),

# Repeated CamelCasedWords
(r'^([A-Z][a-z]+){3,}$', 'JUNK'),

############################################################################
Expand Down Expand Up @@ -1079,7 +1085,7 @@ def build_detection_from_node(
(r'^whom$', 'JUNK'),
(r'^However,?$', 'JUNK'),
(r'^[Cc]ollectively$', 'JUNK'),
(r'^following$', 'JUNK'),
(r'^following$', 'FOLLOWING'),
(r'^[Cc]onfig$', 'JUNK'),
(r'^file\.$', 'JUNK'),

Expand Down Expand Up @@ -1184,7 +1190,7 @@ def build_detection_from_node(
(r'^[a-z]{3,10}[A-Z][a-z]{3,10}$', 'JUNK'),

(r'^\$?Guid$', 'JUNK'),
(r'^Small$', 'NN'),
#(r'^Small$', 'NN'),
(r'^implementing$', 'JUNK'),
(r'^Unlike$', 'JUNK'),
(r'^using$', 'JUNK'),
Expand All @@ -1206,6 +1212,11 @@ def build_detection_from_node(
# single period
(r"^\.$", 'JUNK'),

# exception to the next rule

# by PaX Team
(r"PaX$", 'NN'),

# short mixed caps with trailing cap: ZoY
(r"[A-Z][a-z][A-Z]$", 'JUNK'),

Expand Down Expand Up @@ -1405,6 +1416,7 @@ def build_detection_from_node(
(r'^STA$', 'NN'),
(r'^Page$', 'NN'),
(r'^Todo/Under$', 'JUNK'),
(r'^Under$', 'NN'),

(r'^Interrupt$', 'NN'),
(r'^cleanups?$', 'JUNK'),
Expand Down Expand Up @@ -1668,6 +1680,8 @@ def build_detection_from_node(
(r'^([Mm]onday|[Tt]uesday|[Ww]ednesday|[Tt]hursday|[Ff]riday|[Ss]aturday|[Ss]unday),?$', 'DAY'),
(r'^(Mon|Tue|Wed|Thu|Fri|Sat|Sun|May),?$', 'NN'),

(r'^[Dd]ebugging$', 'JUNK'),

# misc words that are not NNs
# lowercase verbs ending in "ing"
(r'^[a-z]+ing$', 'NN'),
Expand Down Expand Up @@ -1700,6 +1714,9 @@ def build_detection_from_node(
(r'^Moved$', 'NN'),
(r'^Phone$', 'NN'),

(r'^Inputs?$', 'NN'),


# dual caps that are not NNP
(r'^Make[A-Z]', 'JUNK'),
(r'^Create[A-Z]', 'JUNK'),
Expand Down Expand Up @@ -2069,6 +2086,7 @@ def build_detection_from_node(
# and Spanish/French Da Siva and De Gaulle
(r'^(([Vv][ao]n)|[Dd][aeu])$', 'VAN'),

(r'^aan$', 'OF'),
(r'^van$', 'VAN'),
(r'^Van$', 'VAN'),
(r'^von$', 'VAN'),
Expand Down Expand Up @@ -2134,7 +2152,10 @@ def build_detection_from_node(
(r'^\$?date-of-software$', 'YR'),
(r'^\$?date-of-document$', 'YR'),

# cardinal numbers
# small cardinal numbers, under 40 (one or two digits, first digit 0-3)
(r'^[0-3]?[0-9]?[\.,]?$', 'CDS'),

# all other cardinal numbers
(r'^-?[0-9]+(.[0-9]+)?[\.,]?$', 'CD'),

############################################################################
Expand Down Expand Up @@ -2179,6 +2200,7 @@ def build_detection_from_node(

# exceptions to CAPS used in obfuscated emails like in joe AT foo DOT com
(r'^AT$', 'AT'),
(r'^AT$', '<at>'),
(r'^DOT$', 'DOT'),

# all CAPS word, at least 1 char long such as MIT, including an optional trailing comma or dot
Expand Down Expand Up @@ -2288,6 +2310,9 @@ def build_detection_from_node(
# some punctuation combos
(r'^(?:=>|->|<-|<=)$', 'JUNK'),

(r'^semiconductors?[\.,]?$', 'NNP'),


############################################################################
# catch all other as Nouns
############################################################################
Expand All @@ -2308,17 +2333,21 @@ def build_detection_from_node(
YR-RANGE: {<YR>+ <CC>+ <YR>} #20
YR-RANGE: {<YR> <DASH|TO>* <YR|BARE-YR>+} #30
YR-RANGE: {<CD|BARE-YR>? <YR> <BARE-YR>?} #40
YR-RANGE: {<CD|CDS|BARE-YR>? <YR> <BARE-YR>?} #40
YR-RANGE: {<YR>+ <BARE-YR>? } #50
YR-AND: {<CC>? <YR>+ <CC>+ <YR>} #60
YR-RANGE: {<YR-AND>+} #70
YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
# Copyright (c) 1999, 2000, 01, 03, 06 Ralf Baechle
YR-RANGE: {<YR-RANGE> <CD>+} #72.2
YR-RANGE: {<YR-RANGE> <CD|CDS>+} #72.2
CD: {<BARE-YR>} #bareyear
# 5 Jan 2003
YR-RANGE: {<CDS> <NNP> <YR-RANGE>} #72.3
#######################################
# All/No/Some Rights Reserved
#######################################
Expand All @@ -2343,6 +2372,9 @@ def build_detection_from_node(
# [email protected] or [email protected]
EMAIL: {<EMAIL> <NN> <EMAIL>} # email or email
# <srinivasa.deevi at conexant dot com>
EMAIL: {<EMAIL_START> <CC> <NN> <DOT> <NN> } #email with brackets
#######################################
# NAMES and COMPANIES
#######################################
Expand Down Expand Up @@ -2408,8 +2440,9 @@ def build_detection_from_node(
# AT&T Laboratories, Cambridge
COMPANY: {<COMP> <COMP> <NNP>} #145
COMPANY: {<COMP> <CD|CDS> <COMP>} #170
# rare "Software in the public interest, Inc."
COMPANY: {<COMP> <CD> <COMP>} #170
COMPANY: {<NNP> <IN><NN> <NNP> <NNP>+<COMP>?} #180
# Commonwealth Scientific and Industrial Research Organisation (CSIRO)
Expand Down Expand Up @@ -2558,18 +2591,21 @@ def build_detection_from_node(
NAME: {<NAME|NAME-EMAIL>+ <OF> <NNP> <OF> <NN>? <COMPANY>} #550
NAME: {<NAME|NAME-EMAIL>+ <CC|OF>? <NAME|NAME-EMAIL|COMPANY>} #560
NAME: {<NNP><NNP>} #5611
NAME: {<NNP><NNP>} #561
# strip Software from Copyright (c) Ian Darwin 1995. Software
NAME-YEAR: {<NAME>+ <YR-RANGE>} #5611
NAME-YEAR: {<NAME>+ <YR-RANGE>} #561.1
# Copyright 2018, OpenCensus Authors
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #1579991
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #561.2
# Tom aan de Wiel
NAME: {<NNP> <OF> <VAN> <NNP> } # 561.3
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>? <LINUX>?} #5612
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>? <LINUX>?} #562
#Academy of Motion Picture Arts and Sciences
NAME: {<NAME> <CC> <NNP>} #561
NAME: {<NAME> <CC> <NNP>} #563
# Adam Weinberger and the GNOME Foundation
ANDCO: {<CC> <NN> <COMPANY>} #565
Expand All @@ -2581,6 +2617,8 @@ def build_detection_from_node(
URL: {<PARENS> <URL> <PARENS>} #5700
NAME-YEAR: {<NAME-YEAR> <CDS> <NNP>} #5700.1
#also accept trailing email and URLs
# and "VAN" e.g. Du: Copyright (c) 2008 Alek Du <[email protected]>
NAME-YEAR: {<NAME-YEAR> <VAN>? <EMAIL>?<URL>?} #5701
Expand All @@ -2591,7 +2629,7 @@ def build_detection_from_node(
NAME: {<NN|NNP|CAPS>+ <CC> <OTH>} #600
NAME: {<NNP> <CAPS>} #610
NAME: {<CAPS> <DASH>? <NNP|NAME>} #620
NAME: {<NNP> <CD> <NNP>} #630
NAME: {<NNP> <CD|CDS> <NNP>} #630
NAME: {<COMP> <NAME>+} #640
# Copyright 2018-2019 @paritytech/substrate-light-ui authors & contributors
Expand Down Expand Up @@ -2983,7 +3021,11 @@ def build_detection_from_node(
# Russ Dill <[email protected]> 2001-2003
# Rewrited by Vladimir Oleynik <[email protected]> (C) 2003
COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #2280-2
# Copyright (C) 2018
# Author: Jeff LaBundy <[email protected]>
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <AUTH> <NAME-EMAIL>} #2280-3
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280
Expand Down Expand Up @@ -3106,7 +3148,7 @@ def build_detection_from_node(
COPYRIGHT: {<COPYRIGHT2> <CAPS|COMPANY> <NN|LINUX> <COMPANY>} #2008
# Copyright (c) 2016-2018 JSR 371 expert group and contributors
COPYRIGHT: {<COPYRIGHT2> <CAPS> <CD> <COMPANY> <NAME>} #2009.1
COPYRIGHT: {<COPYRIGHT2> <CAPS> <CD|CDS> <COMPANY> <NAME>} #2009.1
# COPYRIGHT (c) 2006 - 2009 DIONYSOS
COPYRIGHT: {<COPYRIGHT2> <CAPS>} #2009
Expand Down Expand Up @@ -3235,7 +3277,7 @@ def build_detection_from_node(
COPYRIGHT: {<COPY> <NNP> <NAME-YEAR> <COMPANY>?} #15720
# Copyright (c) 2008-1010 Intel Corporation
COPYRIGHT: {<COPY> <COPY> <CD> <COMPANY>} #rare-cd-not-year
COPYRIGHT: {<COPY> <COPY> <CD|CDS> <COMPANY>} #rare-cd-not-year
# Copyright (C) 2005-2006 dann frazier <[email protected]>
COPYRIGHT: {<COPYRIGHT2> <NN> <NN> <EMAIL>} #999991
Expand All @@ -3258,6 +3300,9 @@ def build_detection_from_node(
# copyrighted by the Open Source Vulnerability Database (http://osvdb.org)
COPYRIGHT: {<COPY> <BY> <NN|NNP>{3} <NAME>} #83002.1
# (C) by the respective authors,
<COPYRIGHT>: { <COPY> <BY> <NN> <NN> <AUTHDOT>} #83002.2
# weird //opylefted by <-Harvie 2oo7
COPYRIGHT: {<COPY> <BY> <NN> <NN> <MAINT>?} #83003
Expand Down Expand Up @@ -3301,6 +3346,14 @@ def build_detection_from_node(
# Gracenote Software, copyright © 2000-2008 Gracenote.
COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
#Copyright (C) 2012-2016 by the following authors:
#- Wladimir J. van der Laan <[email protected]>
NAME-EMAIL: {<NNP> <NAME-EMAIL> } #157999.13
NAME-EMAIL: {<DASH> <NAME-EMAIL> <NN>?} #157999.14
COPYRIGHT: {<COPYRIGHT2> <FOLLOWING> <AUTHS> <NAME-EMAIL>+ } #157999.14
#######################################
# Copyright is held by ....
#######################################
Expand Down Expand Up @@ -3412,11 +3465,11 @@ def build_detection_from_node(
COPYRIGHT: {<COMPANY><COPY>+<ALLRIGHTRESERVED>} #99900
COPYRIGHT: {<COPYRIGHT|COPYRIGHT2|COPY|NAME-COPY> <COPY|NNP|AUTHDOT|CAPS|CD|YR-RANGE|NAME|NAME-EMAIL|NAME-YEAR|NAME-COPY|NAME-CAPS|AUTHORANDCO|COMPANY|YEAR|PN|COMP|UNI|CC|OF|IN|BY|OTH|VAN|URL|EMAIL|URL2|MIXEDCAP|NN>+ <ALLRIGHTRESERVED>} #99999
COPYRIGHT: {<COPYRIGHT|COPYRIGHT2|COPY|NAME-COPY> <COPY|NNP|AUTHDOT|CAPS|CD|CDS|YR-RANGE|NAME|NAME-EMAIL|NAME-YEAR|NAME-COPY|NAME-CAPS|AUTHORANDCO|COMPANY|YEAR|PN|COMP|UNI|CC|OF|IN|BY|OTH|VAN|URL|EMAIL|URL2|MIXEDCAP|NN>+ <ALLRIGHTRESERVED>} #99999
# * Copyright (C) 2004 Red Hat, Inc.
# * Copyright (C) 200 Matthias Clasen <[email protected]>
COPYRIGHT: {<COPY> <COPY> <CD> <NAME-EMAIL>} #9999970
COPYRIGHT: {<COPY> <COPY> <CD|CDS> <NAME-EMAIL>} #9999970
# <p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612#Copyright">Copyright</a>
COPYRIGHT: {<COPYRIGHT> <COPY>} #9999980
Expand Down Expand Up @@ -3803,6 +3856,8 @@ def is_junk_copyright(s, patterns=COPYRIGHTS_JUNK_PATTERN_MATCHERS):
'$',
'current.year',
"©",
'author',
'authors',
])
))

Expand Down
4 changes: 1 addition & 3 deletions src/licensedcode/data/licenses/array-input-method-pl.LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ ignorable_copyrights:
- copyright holder of Array Input Method
ignorable_holders:
- Array Input Method
ignorable_authors:
- Array Input
---

Array Input Method Public License
Expand Down Expand Up @@ -80,4 +78,4 @@ or other liability obligations and/or rights consistent with this License. Howev
obligations, licensee may act only on his own behalf and on his sole responsibility, not on behalf of
anyone else, and only if the licensee agrees toindemnify, defend, and hold everyone else harmless
for any liability incurred by, or claims asserted against, such everyone else by reason of licensee's
accepting any such warranty or additional liability.
accepting any such warranty or additional liability.
6 changes: 3 additions & 3 deletions src/licensedcode/data/licenses/wxwidgets.LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ text_urls:
- http://www.wxwidgets.org/about/licence.htm
ignorable_copyrights:
- Copyright (c) 1997 Julian Smart, Markus Holzem
- copyrighted by the wxWidgets
- copyrighted by the wxWidgets authors
ignorable_holders:
- Julian Smart, Markus Holzem
- the wxWidgets
- the wxWidgets authors
ignorable_emails:
- [email protected]
---
Expand Down Expand Up @@ -238,4 +238,4 @@ library for tweaking knobs) written by James Random Hacker.

<signature of Ty Coon>, 1 April 1990

Ty Coon, President of Vice
Ty Coon, President of Vice
3 changes: 1 addition & 2 deletions src/licensedcode/data/rules/apache-1.1_114.RULE
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ ignorable_holders:
- Leo Galambos
ignorable_authors:
- the Egothor Project
- the Egothor Project. Under
ignorable_urls:
- http://egothor.sf.net/
ignorable_emails:
Expand Down Expand Up @@ -71,4 +70,4 @@ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
OF THE POSSIBILITY OF SUCH DAMAGE.
2 changes: 1 addition & 1 deletion tests/cluecode/data/copyrights/misco2/mmiv.txt.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ what:
- holders_summary
- authors
copyrights:
- Copyright (c) MMIV-MMV Anselm R. Garbe
- Copyright (c) MMIV-MMV Anselm R. Garbe garbeam at gmail dot com
holders:
- MMIV-MMV Anselm R. Garbe
holders_summary:
Expand Down
1 change: 1 addition & 0 deletions tests/cluecode/data/copyrights/misco4/more-linux/aan.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Copyright 2016 Tom aan de Wiel
8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/more-linux/aan.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright 2016 Tom aan de Wiel
holders:
- Tom aan de Wiel
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco4/more-linux/auth-nl.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
* Copyright (C) 2016-2018
* Author: Matt Ranostay <[email protected]>

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 2016-2018 Author Matt Ranostay <[email protected]>
holders:
- Matt Ranostay
1 change: 1 addition & 0 deletions tests/cluecode/data/copyrights/misco4/more-linux/ben.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Copyright 2010 Ben Dooks <ben-linux <at> fluff.org>
Loading

0 comments on commit 498467c

Please sign in to comment.