-
-
Notifications
You must be signed in to change notification settings - Fork 582
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix choking license detection post-processing #3245
We were iterating over all license detections once for every unique detection identifier, which took forever to complete. This approach groups the detections in a dict/hashmap in a single pass instead, which fixes the issue. Reference: #3245 Signed-off-by: Ayan Sinha Mahapatra <[email protected]> Reported-by: Philippe Ombredanne <[email protected]>
- Loading branch information
1 parent
6358a4b
commit f21c02d
Showing
1 changed file
with
28 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,8 +17,10 @@ | |
from collections import Counter | ||
|
||
import attr | ||
import click | ||
from license_expression import combine_expressions | ||
from license_expression import Licensing | ||
from time import time | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
AyanSinhaMahapatra
Author
Member
|
||
|
||
from commoncode.resource import clean_path | ||
from commoncode.text import python_safe_name | ||
|
@@ -595,33 +597,25 @@ def get_unique_detections(cls, license_detections): | |
Return all unique UniqueDetection from a ``license_detections`` list of | ||
LicenseDetection. | ||
""" | ||
identifiers = get_identifiers(license_detections) | ||
unique_detection_counts = dict(Counter(identifiers)) | ||
|
||
detections_by_id = get_detections_by_id(license_detections) | ||
unique_license_detections = [] | ||
for detection_identifier in unique_detection_counts.keys(): | ||
file_regions = ( | ||
|
||
for all_detections in detections_by_id.values(): | ||
file_regions = [ | ||
detection.file_region | ||
for detection in license_detections | ||
if detection_identifier == detection.identifier | ||
) | ||
all_detections = ( | ||
detection | ||
for detection in license_detections | ||
if detection_identifier == detection.identifier | ||
) | ||
for detection in all_detections | ||
] | ||
|
||
detection = next(all_detections) | ||
detection = next(iter(all_detections)) | ||
detection_mapping = detection.to_dict() | ||
files = list(file_regions) | ||
unique_license_detections.append( | ||
cls( | ||
identifier=detection.identifier_with_expression, | ||
license_expression=detection_mapping["license_expression"], | ||
detection_log=detection_mapping["detection_log"], | ||
matches=detection_mapping["matches"], | ||
count=len(files), | ||
files=files, | ||
count=len(file_regions), | ||
files=file_regions, | ||
) | ||
) | ||
|
||
|
@@ -638,6 +632,23 @@ def dict_fields(attr, value): | |
return attr.asdict(self, filter=dict_fields) | ||
|
||
|
||
def get_detections_by_id(license_detections):
    """
    Return a dict (hashmap) of {detection.identifier: all_detections} where
    ``all_detections`` is the list of all the detections in
    ``license_detections`` that share this same identifier.

    The detections are grouped in a single pass. The dict keys and each list
    of detections keep the order in which the detections appear in
    ``license_detections``. An empty ``license_detections`` returns an empty
    dict.
    """
    detections_by_id = {}

    for detection in license_detections:
        # setdefault creates the list the first time an identifier is seen, so
        # there is a single lookup per detection instead of a membership test
        # followed by a second lookup.
        detections_by_id.setdefault(detection.identifier, []).append(detection)

    return detections_by_id
|
||
def get_identifiers(license_detections): | ||
""" | ||
Return identifiers for all ``license detections``. | ||
|
`time` does not look like it is used anywhere.