Skip to content

Commit

Permalink
Fix choking license detection post-processing #3245
Browse files Browse the repository at this point in the history
We were re-iterating over all license detections once for every unique
identifier, which took forever to complete. This approach groups the
detections in a dict/hashmap in a single pass instead, which fixes the issue.

Reference: #3245
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
Reported-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Feb 13, 2023
1 parent 6358a4b commit f21c02d
Showing 1 changed file with 28 additions and 17 deletions.
45 changes: 28 additions & 17 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
from collections import Counter

import attr
import click
from license_expression import combine_expressions
from license_expression import Licensing
from time import time

This comment has been minimized.

Copy link
@pombredanne

pombredanne Feb 13, 2023

Member

Time does not look like it is used anywhere.

This comment has been minimized.

Copy link
@AyanSinhaMahapatra

AyanSinhaMahapatra Feb 13, 2023

Author Member

oops these were being used for the click.echo() and time() statements, removing these.


from commoncode.resource import clean_path
from commoncode.text import python_safe_name
Expand Down Expand Up @@ -595,33 +597,25 @@ def get_unique_detections(cls, license_detections):
Return all unique UniqueDetection from a ``license_detections`` list of
LicenseDetection.
"""
identifiers = get_identifiers(license_detections)
unique_detection_counts = dict(Counter(identifiers))

detections_by_id = get_detections_by_id(license_detections)
unique_license_detections = []
for detection_identifier in unique_detection_counts.keys():
file_regions = (

for all_detections in detections_by_id.values():
file_regions = [
detection.file_region
for detection in license_detections
if detection_identifier == detection.identifier
)
all_detections = (
detection
for detection in license_detections
if detection_identifier == detection.identifier
)
for detection in all_detections
]

detection = next(all_detections)
detection = next(iter(all_detections))
detection_mapping = detection.to_dict()
files = list(file_regions)
unique_license_detections.append(
cls(
identifier=detection.identifier_with_expression,
license_expression=detection_mapping["license_expression"],
detection_log=detection_mapping["detection_log"],
matches=detection_mapping["matches"],
count=len(files),
files=files,
count=len(file_regions),
files=file_regions,
)
)

Expand All @@ -638,6 +632,23 @@ def dict_fields(attr, value):
return attr.asdict(self, filter=dict_fields)


def get_detections_by_id(license_detections):
    """
    Return a mapping of {identifier: [detections]} grouping every detection
    of the ``license_detections`` iterable by its ``identifier`` attribute.

    Keys are inserted in the order identifiers are first seen, and each list
    keeps its detections in input order. An empty ``license_detections``
    yields an empty dict.
    """
    detections_by_id = {}

    for detection in license_detections:
        # setdefault creates the group the first time an identifier is seen,
        # so grouping is a single pass with one dict lookup per detection.
        detections_by_id.setdefault(detection.identifier, []).append(detection)

    return detections_by_id

def get_identifiers(license_detections):
"""
Return identifiers for all ``license detections``.
Expand Down

0 comments on commit f21c02d

Please sign in to comment.