def get_detections_by_id(license_detections):
    """
    Return a mapping of {identifier: [detections]} that groups every detection
    of the ``license_detections`` iterable by its ``identifier`` attribute.

    Each value is the list of detections sharing that identifier, kept in the
    order they are found in ``license_detections``. Keys are in first-seen
    order. An empty ``license_detections`` yields an empty dict.
    """
    detections_by_id = {}

    for detection in license_detections:
        # setdefault creates the group the first time an identifier is seen.
        # A plain dict (rather than a defaultdict) is kept on purpose so that
        # looking up an unknown identifier in the result still raises KeyError
        # instead of silently inserting an empty group.
        detections_by_id.setdefault(detection.identifier, []).append(detection)

    return detections_by_id