Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 105 additions & 35 deletions src/packagedcode/maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,13 +766,14 @@ def _get_comments(self, xml=None):
def _find_licenses(self):
"""Return an iterable of license mappings."""
for lic in self.pom_data.findall('licenses/license'):
yield dict([
('name', self._get_attribute('name', lic)),
('url', self._get_attribute('url', lic)),
('comments', self._get_attribute('comments', lic)),
# arcane and seldom used
('distribution', self._get_attribute('distribution', lic)),
])
yield {"license": dict([
('name', self._get_attribute('name', lic)),
('url', self._get_attribute('url', lic)),
('comments', self._get_attribute('comments', lic)),
# arcane and seldom used
('distribution', self._get_attribute('distribution', lic)),
])
}

def _find_parties(self, key='developers/developer'):
"""Return an iterable of party mappings for a given xpath."""
Expand Down Expand Up @@ -1254,7 +1255,7 @@ def _parse(
# complex definition in Maven
qualifiers['type'] = extension

extracted_license_statement = pom.licenses
extracted_license_statement = clean_licenses(pom.licenses) or None

group_id = pom.group_id
artifact_id = pom.artifact_id
Expand Down Expand Up @@ -1325,52 +1326,121 @@ def get_license_detections_for_extracted_license_statement(
approximate=True,
expression_symbols=None,
):
"""
Return license detections from a Maven POM license data structure.
This looks like this in XML, and some attributes are more important than others.
Which one exists and whether we can detect a proper license in each also determines which
attribute we need to consider.
The original XML has this shape:
<licenses>
<license>
<name>Apache-2.0</name>
<url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
<comments> notes... </comments>
</license>
</licenses>
The data structure we keep has this shape:
[{"license":
{
"name": "Apache-2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"comments": "notes...",
}
},
.... other license]
"""

from packagedcode.licensing import get_normalized_license_detections
from packagedcode.licensing import get_license_detections_for_extracted_license_statement

if not cls.check_extracted_license_statement_structure(extracted_license):
if not is_standard_maven_license_data_structure(licenses=extracted_license):
# use the generic detection
return get_normalized_license_detections(
extracted_license=extracted_license,
try_as_expression=try_as_expression,
approximate=approximate,
expression_symbols=expression_symbols,
)
extracted_license = clean_licenses(extracted_license)
extracted_license_statement = saneyaml.dump(extracted_license)

new_extracted_license = extracted_license.copy()

for license_entry in new_extracted_license:
license_entry.pop("distribution")
if not license_entry.get("name"):
license_entry.pop("name")
if not license_entry.get("url"):
license_entry.pop("url")
if not license_entry.get("comments"):
license_entry.pop("comments")

extracted_license_statement = saneyaml.dump(new_extracted_license)

return get_license_detections_for_extracted_license_statement(
detections = get_license_detections_for_extracted_license_statement(
extracted_license_statement=extracted_license_statement,
try_as_expression=try_as_expression,
approximate=approximate,
expression_symbols=expression_symbols,
)
# TODO: if we have any unknown license, we need to try harder
# We can detect each license item individually and check if the unknown was detected
# in the name, URL or comment field.
# name, URL, comments
# name unknown: keep that unknown in all cases
# URL or comments with unknown, but name not unknown: we want to combine the unknown
# matches with the correct name match

@classmethod
def check_extracted_license_statement_structure(cls, extracted_license):
return detections

is_list_of_mappings = False
if not isinstance(extracted_license, list):
return is_list_of_mappings
else:
is_list_of_mappings = True

for extracted_license_item in extracted_license:
if not isinstance(extracted_license_item, dict):
is_list_of_mappings = False
break
def clean_licenses(licenses):
    """
    Return the ``licenses`` list of POM license data, cleaned in place from unwanted fields.

    For each well-formed item (a mapping with a single "license" key whose value is a non-empty
    mapping of attributes), remove the "distribution" attribute and remove any of the "name",
    "url" and "comments" attributes that has an empty value. Items with any other shape are left
    untouched and are kept in the list. The same ``licenses`` object is returned; a None or empty
    ``licenses`` is returned as-is.

    Each item in the list has this shape:
    [
        {"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
        {"license": {other fields} },
    ]
    """
    for licitem in (licenses or []):
        if not isinstance(licitem, dict) or len(licitem) != 1:
            continue

        license_attributes = licitem.get("license")
        # A missing, empty or non-mapping value (e.g. a plain string) has no attributes to clean;
        # calling pop() on it would fail, so leave such items alone.
        if not license_attributes or not isinstance(license_attributes, dict):
            continue

        # "distribution" is always dropped, the other attributes only when they carry no value.
        license_attributes.pop("distribution", None)
        for field in ("name", "url", "comments"):
            if not license_attributes.get(field):
                license_attributes.pop(field, None)

    return licenses

return is_list_of_mappings

def is_standard_maven_license_data_structure(licenses):
    """
    Return True if ``licenses`` has the structure expected from Maven POM license data.

    The data is a list of dicts of dicts: each top-level dict has a single item shaped as
    {"license": {mapping of attributes}}, and the attributes mapping must contain at least one of
    the "name", "url" or "comments" keys. We expect the POM license data to be in that shape in
    most cases, except for legacy non POM 4 data.

    Return False if ``licenses`` is not a list, or if any item does not have this shape. An empty
    list has no offending item and returns True.

    Each item in the list has this shape:
    [
        {"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
        {"license": {other fields} },
    ]
    """
    if not isinstance(licenses, list):
        return False

    fields = ("name", "url", "comments")

    for item in licenses:
        if not isinstance(item, dict) or len(item) != 1:
            return False

        license_attributes = item.get("license")
        if not isinstance(license_attributes, dict):
            return False

        # The known fields live in the nested attributes mapping, not in the top-level item
        # whose only key is "license".
        if not any(field in license_attributes for field in fields):
            return False

    return True


def build_vcs_and_code_view_urls(scm):
Expand Down
8 changes: 4 additions & 4 deletions tests/formattedcode/data/common/manifests-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
"rule_relevance": 100,
"rule_identifier": "cddl-1.0_98.RULE",
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
}
],
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
Expand All @@ -53,7 +53,7 @@
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"notice_text": null,
"source_packages": [
"pkg:maven/javax.persistence/[email protected]?classifier=sources"
Expand Down Expand Up @@ -1142,7 +1142,7 @@
"rule_relevance": 100,
"rule_identifier": "cddl-1.0_98.RULE",
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
}
],
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
Expand All @@ -1151,7 +1151,7 @@
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"notice_text": null,
"source_packages": [
"pkg:maven/javax.persistence/[email protected]?classifier=sources"
Expand Down
8 changes: 4 additions & 4 deletions tests/formattedcode/data/common/manifests-expected.jsonlines
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
"rule_relevance": 100,
"rule_identifier": "cddl-1.0_98.RULE",
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
}
],
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
Expand All @@ -85,7 +85,7 @@
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"notice_text": null,
"source_packages": [
"pkg:maven/javax.persistence/[email protected]?classifier=sources"
Expand Down Expand Up @@ -1188,7 +1188,7 @@
"rule_relevance": 100,
"rule_identifier": "cddl-1.0_98.RULE",
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
}
],
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
Expand All @@ -1197,7 +1197,7 @@
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"notice_text": null,
"source_packages": [
"pkg:maven/javax.persistence/[email protected]?classifier=sources"
Expand Down
22 changes: 12 additions & 10 deletions tests/formattedcode/data/common/manifests-expected.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,16 @@ packages:
rule_identifier: cddl-1.0_98.RULE
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
matched_text: |
- name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
identifier: cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c
other_license_expression:
other_license_expression_spdx:
other_license_detections: []
extracted_license_statement: |
- name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
- license:
name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
notice_text:
source_packages:
- pkg:maven/javax.persistence/[email protected]?classifier=sources
Expand Down Expand Up @@ -818,8 +819,8 @@ license_detections:
rule_identifier: cddl-1.0_98.RULE
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
matched_text: |
- name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
- identifier: lgpl_3_0-272571eb-5e68-95b6-ddb0-71de2d8df321
license_expression: lgpl-3.0
license_expression_spdx: LGPL-3.0-only
Expand Down Expand Up @@ -2006,15 +2007,16 @@ files:
rule_identifier: cddl-1.0_98.RULE
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
matched_text: |
- name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
identifier: cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c
other_license_expression:
other_license_expression_spdx:
other_license_detections: []
extracted_license_statement: |
- name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
- license:
name: Common Development and Distribution License (CDDL) v1.0
url: http://www.sun.com/cddl/cddl.html
notice_text:
source_packages:
- pkg:maven/javax.persistence/[email protected]?classifier=sources
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
"organization_url": null,
"licenses": [
{
"name": "Public Domain",
"url": null,
"comments": null,
"distribution": null
"license": {
"name": "Public Domain",
"url": null,
"comments": null,
"distribution": null
}
}
],
"developers": [],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"rule_relevance": 70,
"rule_identifier": "public-domain_bare_words.RULE",
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/public-domain_bare_words.RULE",
"matched_text": "- name: Public Domain"
"matched_text": "name: Public Domain"
}
],
"identifier": "public_domain-3dd945ae-65df-7d90-6467-60f8ecf2eb77"
Expand All @@ -51,7 +51,7 @@
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Public Domain\n",
"extracted_license_statement": "- license:\n name: Public Domain\n",
"notice_text": null,
"source_packages": [
"pkg:maven/aopalliance/[email protected]?classifier=sources"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
"organization_url": null,
"licenses": [
{
"name": "Eclipse Public License - v 1.0",
"url": "http://www.eclipse.org/legal/epl-v10.html",
"comments": null,
"distribution": "repo"
"license": {
"name": "Eclipse Public License - v 1.0",
"url": "http://www.eclipse.org/legal/epl-v10.html",
"comments": null,
"distribution": "repo"
}
}
],
"developers": [],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"rule_relevance": 100,
"rule_identifier": "epl-1.0_4.RULE",
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/epl-1.0_4.RULE",
"matched_text": "- name: Eclipse Public License - v 1.0"
"matched_text": "name: Eclipse Public License - v 1.0"
},
{
"license_expression": "epl-1.0",
Expand All @@ -57,7 +57,7 @@
"rule_relevance": 100,
"rule_identifier": "epl-1.0.RULE",
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/epl-1.0.RULE",
"matched_text": " url: http://www.eclipse.org/legal/epl-v10.html"
"matched_text": "url: http://www.eclipse.org/legal/epl-v10.html"
}
],
"identifier": "epl_1_0-48d15cd3-0ccf-4f62-3d30-24dc9b7308e5"
Expand All @@ -66,7 +66,7 @@
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Eclipse Public License - v 1.0\n url: http://www.eclipse.org/legal/epl-v10.html\n",
"extracted_license_statement": "- license:\n name: Eclipse Public License - v 1.0\n url: http://www.eclipse.org/legal/epl-v10.html\n",
"notice_text": null,
"source_packages": [
"pkg:maven/aspectj/[email protected]?classifier=sources"
Expand Down
Loading
Loading