def is_valid_maven_group_id(group_id: str) -> bool:
    """Check if the provided string is a valid maven group id.

    A group id is accepted when it is made of at least two dot-separated segments.
    Each segment must start with an ASCII letter and may continue with ASCII letters,
    digits, hyphens and underscores. The last segment must not end with a hyphen or an underscore.

    Parameters
    ----------
    group_id : str
        The group id to check.

    Returns
    -------
    bool
        True if the group id is valid, False otherwise
    """
    # Should match strings like org.example.foo, org.example-2.foo.bar_1.
    segment = r"[a-zA-Z][a-zA-Z0-9_-]*"
    # The final segment may be a single letter, but when longer it has to end with a letter or a digit.
    last_segment = r"[a-zA-Z](?:[a-zA-Z0-9_-]*[a-zA-Z0-9])?"
    pattern = rf"(?:{segment}\.)+{last_segment}"

    # Use fullmatch instead of match with "$": "$" also matches right before a trailing newline,
    # which would let a string such as "org.example\n" through.
    return re.fullmatch(pattern, group_id) is not None
deps.dev. + + Parameters + ---------- + project_url : str + The URL of the project. + + Returns + ------- + dict[str, Any] | None + The project information or None if the information could not be retrieved. + """ + clean_repo_url = clean_url(project_url) + if clean_repo_url is None or clean_repo_url.hostname is None: + logger.debug("Invalid project url format: %s", project_url) + return None + + project_key = clean_repo_url.hostname + clean_repo_url.path + + request_url = f"https://api.deps.dev/v3alpha/projects/{encode(project_key, safe='')}" + response = send_get_http_raw(request_url) + if not (response and response.text): + logger.debug("Failed to retrieve additional repo info for: %s", project_url) + return None + + try: + response_json: dict = json.loads(response.text) + except ValueError as error: + logger.debug("Failed to parse response from deps.dev: %s", error) + return None + + return response_json + def _create_urls(self, purl: PackageURL) -> list[str]: """ Create the urls to search for the metadata relating to the passed artifact. diff --git a/src/macaron/repo_verifier/__init__.py b/src/macaron/repo_verifier/__init__.py new file mode 100644 index 000000000..727c3c37d --- /dev/null +++ b/src/macaron/repo_verifier/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This package contains classes for repository verification.""" diff --git a/src/macaron/repo_verifier/repo_verifier.py b/src/macaron/repo_verifier/repo_verifier.py new file mode 100644 index 000000000..534bae57d --- /dev/null +++ b/src/macaron/repo_verifier/repo_verifier.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +"""This module contains code to verify whether a reported repository can be linked back to the artifact.""" +import logging + +from macaron.repo_verifier.repo_verifier_base import ( + RepositoryVerificationResult, + RepositoryVerificationStatus, + RepoVerifierBase, +) +from macaron.repo_verifier.repo_verifier_gradle import RepoVerifierGradle +from macaron.repo_verifier.repo_verifier_maven import RepoVerifierMaven +from macaron.slsa_analyzer.build_tool import BaseBuildTool, Gradle, Maven + +logger = logging.getLogger(__name__) + + +def verify_repo( + namespace: str | None, + name: str, + version: str, + reported_repo_url: str, + reported_repo_fs: str, + build_tool: BaseBuildTool, +) -> RepositoryVerificationResult: + """Verify whether the repository links back to the artifact. + + Parameters + ---------- + namespace : str | None + The namespace of the artifact. + name : str + The name of the artifact. + version : str + The version of the artifact. + reported_repo_url : str + The reported repository URL. + reported_repo_fs : str + The reported repository filesystem path. + build_tool : BaseBuildTool + The build tool used to build the package. + + Returns + ------- + RepositoryVerificationResult + The result of the repository verification + """ + # TODO: Add support for other build tools. 
+ verifier_map: dict[type[BaseBuildTool], type[RepoVerifierBase]] = { + Maven: RepoVerifierMaven, + Gradle: RepoVerifierGradle, + # Poetry(): RepoVerifierPoetry, + # Pip(): RepoVerifierPip, + # Docker(): RepoVerifierDocker, + # NPM(): RepoVerifierNPM, + # Yarn(): RepoVerifierYarn, + # Go(): RepoVerifierGo, + } + + verifier_cls = verifier_map.get(type(build_tool)) + if not verifier_cls: + return RepositoryVerificationResult( + status=RepositoryVerificationStatus.UNKNOWN, reason="unsupported_type", build_tool=build_tool + ) + + verifier = verifier_cls( + namespace=namespace, + name=name, + version=version, + reported_repo_url=reported_repo_url, + reported_repo_fs=reported_repo_fs, + ) + + return verifier.verify_repo() diff --git a/src/macaron/repo_verifier/repo_verifier_base.py b/src/macaron/repo_verifier/repo_verifier_base.py new file mode 100644 index 000000000..1fee6a31c --- /dev/null +++ b/src/macaron/repo_verifier/repo_verifier_base.py @@ -0,0 +1,139 @@ +# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module contains the base class and core data models for repository verification.""" +import abc +import logging +import os +from collections import deque +from dataclasses import dataclass +from enum import Enum +from pathlib import Path + +from macaron.slsa_analyzer.build_tool import BaseBuildTool + +logger = logging.getLogger(__name__) + + +def find_file_in_repo(root_dir: Path, filename: str) -> Path | None: + """Find the highest level file with a given name in a local repository. + + This function ignores certain paths that are not under the main source code directories. + + Parameters + ---------- + root_dir : Path + The root directory of the repository. + filename : str + The name of the file to search for. 
+ + Returns + ------- + Path | None + The path to the file if it exists, otherwise + """ + # TODO: Consider using BaseBuildTool.get_build_dirs. + # + Refactor 'get_build_dirs' to skip certain directories + # that are most likely not part of the main codebase (e.g., sample). + # + Need to find a way to look for other + # files (e.g., gradle.properties) for the purpose of repo verification + # without breaking the current logic of finding build directories. + # + Add the capability to return the content/path of the file. + if not os.path.isdir(root_dir): + return None + + queue: deque[Path] = deque() + queue.append(Path(root_dir)) + while queue: + current_dir = queue.popleft() + + # Don't look through non-main directories. + if any( + keyword in current_dir.name.lower() + for keyword in ["test", "example", "sample", "doc", "demo", "spec", "mock"] + ): + continue + + if Path(current_dir, filename).exists(): + return Path(current_dir, filename) + + # Ignore symlinks to prevent potential infinite loop. + sub_dirs = [Path(it) for it in current_dir.iterdir() if it.is_dir() and not it.is_symlink()] + queue.extend(sub_dirs) + + return None + + +class RepositoryVerificationStatus(str, Enum): + """A class to store the status of the repo verification.""" + + #: We found evidence to prove that the repository can be linked back to the publisher of the artifact. + PASSED = "passed" + + #: We found evidence showing that the repository is not the publisher of the artifact. + FAILED = "failed" + + #: We could not find any evidence to prove or disprove that the repository can be linked back to the artifact. + UNKNOWN = "unknown" + + +@dataclass(frozen=True) +class RepositoryVerificationResult: + """A class to store the information about repository verification.""" + + #: The status of the repository verification. + status: RepositoryVerificationStatus + + #: The reason for the verification result. + reason: str + + #: The build tool used to build the package. 
class RepoVerifierBase(abc.ABC):
    """Common interface for verifiers that check whether a reported repository links back to an artifact.

    A concrete verifier provides the build tool it is responsible for and implements ``verify_repo``.
    """

    def __init__(
        self,
        namespace: str | None,
        name: str,
        version: str,
        reported_repo_url: str,
        reported_repo_fs: str,
    ):
        """Store the artifact coordinates and the location of the reported repository.

        Parameters
        ----------
        namespace : str | None
            The namespace of the artifact.
        name : str
            The name of the artifact.
        version : str
            The version of the artifact.
        reported_repo_url : str
            The URL of the repository reported by the publisher.
        reported_repo_fs : str
            The file system path of the reported repository.
        """
        # Where the reported repository lives.
        self.reported_repo_url = reported_repo_url
        self.reported_repo_fs = reported_repo_fs

        # The coordinates of the artifact under verification.
        self.namespace = namespace
        self.name = name
        self.version = version

    @property
    @abc.abstractmethod
    def build_tool(self) -> BaseBuildTool:
        """Define the build tool used to build the package."""

    @abc.abstractmethod
    def verify_repo(self) -> RepositoryVerificationResult:
        """Verify whether the repository links back to the artifact.

        Returns
        -------
        RepositoryVerificationResult
            The result of the repository verification
        """
class RepoVerifierGradle(RepoVerifierBase):
    """A class to verify whether a repository with Gradle build tool links back to the artifact."""

    build_tool = Gradle()

    def __init__(
        self,
        namespace: str | None,
        name: str,
        version: str,
        reported_repo_url: str,
        reported_repo_fs: str,
    ):
        """Initialize a RepoVerifierGradle instance.

        Parameters
        ----------
        namespace : str | None
            The namespace of the artifact.
        name : str
            The name of the artifact.
        version : str
            The version of the artifact.
        reported_repo_url : str
            The URL of the repository reported by the publisher.
        reported_repo_fs : str
            The file system path of the reported repository.
        """
        super().__init__(namespace, name, version, reported_repo_url, reported_repo_fs)

        # The check against recognized code hosting services is shared with the Maven verifier.
        self.maven_verifier = RepoVerifierMaven(
            namespace=namespace,
            name=name,
            version=version,
            reported_repo_url=reported_repo_url,
            reported_repo_fs=reported_repo_fs,
        )

    def verify_repo(self) -> RepositoryVerificationResult:
        """Verify whether the reported repository links back to the artifact.

        The namespace is first compared with the account on recognized code hosting services.
        If that does not pass, the group declared in gradle.properties, build.gradle or
        build.gradle.kts (in this order) is compared with the namespace.

        Returns
        -------
        RepositoryVerificationResult
            The result of the repository verification
        """
        if not self.namespace:
            logger.debug("No namespace provided for Gradle verification.")
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="no_namespace", build_tool=self.build_tool
            )

        recognized_services_verification_result = (
            self.maven_verifier.verify_domains_from_recognized_code_hosting_services()
        )
        if recognized_services_verification_result.status == RepositoryVerificationStatus.PASSED:
            return recognized_services_verification_result

        # Each extractor is only tried when the previous ones did not find a group id.
        gradle_group_id = (
            self._extract_group_id_from_properties()
            or self._extract_group_id_from_build_groovy()
            or self._extract_group_id_from_build_kotlin()
        )
        if not gradle_group_id:
            logger.debug("Could not find group from gradle manifests for %s", self.reported_repo_url)
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN,
                reason="no_group_in_gradle_manifest",
                build_tool=self.build_tool,
            )

        if not same_organization(gradle_group_id, self.namespace):
            logger.debug("Group in gradle manifest does not match the provided group id: %s", self.reported_repo_url)
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.FAILED, reason="group_id_mismatch", build_tool=self.build_tool
            )

        return RepositoryVerificationResult(
            status=RepositoryVerificationStatus.PASSED, reason="group_id_match", build_tool=self.build_tool
        )

    def _extract_group_id_from_gradle_manifest(
        self, file_path: Path | None, quote_chars: set[str] | None = None, delimiter: str = "="
    ) -> str | None:
        """Extract the group id from a gradle build or config file.

        Only lines of the form ``group<delimiter><value>`` are considered. The first value that is
        a valid maven group id is returned.

        Parameters
        ----------
        file_path : Path | None
            The path to the file.
        quote_chars : set[str] | None
            The characters used to quote the group id. If provided, the value must be enclosed in a
            matching pair of one of these characters. If not provided, the value is used as is.
        delimiter : str
            The delimiter used in the file.

        Returns
        -------
        str | None
            The extracted group id. None if not found or if the file could not be read.
        """
        if not file_path:
            logger.debug("Could not find the file %s in the repository: %s", file_path, self.reported_repo_url)
            return None

        try:
            file_content = file_path.read_text(encoding="utf-8").splitlines()
        except (OSError, UnicodeDecodeError) as error:
            # An unreadable manifest means "no evidence", it must not abort the whole verification.
            logger.debug("Could not read the file %s: %s", file_path, error)
            return None

        for line in file_content:
            # NOTE(review): the whole line is lower-cased, so the returned group id is lower-cased as well.
            line_parts = [part for part in map(str.strip, line.strip().lower().split(delimiter)) if part]
            if len(line_parts) != 2 or line_parts[0] != "group":
                continue

            group_id = line_parts[1]

            # Check if the value for group_id is a string literal.
            if quote_chars:
                if len(group_id) < 2 or group_id[0] not in quote_chars or group_id[0] != group_id[-1]:
                    continue
                group_id = group_id[1:-1]

            if is_valid_maven_group_id(group_id):
                return group_id

        return None

    def _extract_group_id_from_properties(self) -> str | None:
        """Extract the group id from the gradle.properties file."""
        gradle_properties = find_file_in_repo(Path(self.reported_repo_fs), "gradle.properties")
        return self._extract_group_id_from_gradle_manifest(gradle_properties)

    def _extract_group_id_from_build_groovy(self) -> str | None:
        """Extract the group id from the build.gradle file.

        Both Groovy spellings are supported: the method-call form (``group 'org.example'``)
        and the assignment form (``group = 'org.example'``).
        """
        build_gradle = find_file_in_repo(Path(self.reported_repo_fs), "build.gradle")
        quote_chars = {"'", '"'}

        group_id = self._extract_group_id_from_gradle_manifest(build_gradle, quote_chars=quote_chars, delimiter=" ")
        if group_id or not build_gradle:
            return group_id

        # Splitting "group = 'org.example'" on spaces yields three tokens, so the assignment
        # form has to be parsed with "=" as the delimiter.
        return self._extract_group_id_from_gradle_manifest(build_gradle, quote_chars=quote_chars, delimiter="=")

    def _extract_group_id_from_build_kotlin(self) -> str | None:
        """Extract the group id from the build.gradle.kts file."""
        build_gradle = find_file_in_repo(Path(self.reported_repo_fs), "build.gradle.kts")
        return self._extract_group_id_from_gradle_manifest(build_gradle, quote_chars={'"'}, delimiter="=")
class RepoVerifierMaven(RepoVerifierBase):
    """A class to verify whether a repository with Maven build tool links back to the artifact."""

    build_tool = Maven()

    def verify_repo(self) -> RepositoryVerificationResult:
        """Verify whether the reported repository links back to the Maven artifact.

        The namespace is first compared with the account on recognized code hosting services.
        If that does not pass, the group id of the highest level pom.xml in the repository is
        compared with the namespace.

        Returns
        -------
        RepositoryVerificationResult
            The result of the repository verification
        """
        if not self.namespace:
            logger.debug("No namespace provided for Maven verification.")
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="no_namespace", build_tool=self.build_tool
            )

        recognized_services_verification_result = self.verify_domains_from_recognized_code_hosting_services()
        if recognized_services_verification_result.status == RepositoryVerificationStatus.PASSED:
            return recognized_services_verification_result

        # TODO: check other pom files. Think about how to decide in case of contradicting evidence.
        # Check if repo contains pom.xml.
        pom_file = find_file_in_repo(Path(self.reported_repo_fs), "pom.xml")
        if not pom_file:
            logger.debug("Could not find any pom.xml in the repository: %s", self.reported_repo_url)
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="no_pom", build_tool=self.build_tool
            )

        try:
            pom_content = pom_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as error:
            # A pom that cannot be read gives no evidence in either direction.
            logger.debug("Could not read pom.xml %s: %s", pom_file.as_posix(), error)
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="not_parsed_pom", build_tool=self.build_tool
            )

        pom_root = parse_pom_string(pom_content)
        if not pom_root:
            logger.debug("Could not parse pom.xml: %s", pom_file.as_posix())
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="not_parsed_pom", build_tool=self.build_tool
            )

        # Find the group id in the pom (project/groupId).
        # The closing curly brace represents the end of the XML namespace. A pom that does not
        # declare the XML namespace has the bare tag name instead.
        pom_group_id_elem = next(
            (
                ch
                for ch in pom_root
                if isinstance(ch.tag, str) and (ch.tag == "groupId" or ch.tag.endswith("}groupId"))
            ),
            None,
        )
        if pom_group_id_elem is None or not pom_group_id_elem.text:
            logger.debug("Could not find groupId in pom.xml: %s", pom_file)
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="no_group_id_in_pom", build_tool=self.build_tool
            )

        pom_group_id = pom_group_id_elem.text.strip()
        if not same_organization(pom_group_id, self.namespace):
            logger.debug("Group id in pom.xml does not match the provided group id: %s", pom_file)
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.FAILED, reason="group_id_mismatch", build_tool=self.build_tool
            )

        return RepositoryVerificationResult(
            status=RepositoryVerificationStatus.PASSED, reason="group_id_match", build_tool=self.build_tool
        )

    def verify_domains_from_recognized_code_hosting_services(self) -> RepositoryVerificationResult:
        """Verify repository link by comparing the maven domain name and the account on code hosting services.

        This verification relies on the fact that Sonatype recognizes
        certain code hosting platforms for namespace verification on maven central.

        Returns
        -------
        RepositoryVerificationResult
            The result of the repository verification
        """
        if not self.namespace:
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="no_namespace", build_tool=self.build_tool
            )

        try:
            parsed_url = urlparse(self.reported_repo_url)
        except ValueError:
            # urlparse raises for malformed input, e.g., an unbalanced bracket in the network location.
            parsed_url = None
        if parsed_url is None or not parsed_url.hostname:
            logger.debug("Could not parse the claimed repository URL: %s", self.reported_repo_url)
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.UNKNOWN, reason="url_parse_error", build_tool=self.build_tool
            )

        reported_hostname = parsed_url.hostname.split(".")[0].lower()
        reported_account = parsed_url.path.strip("/").split("/")[0].lower()

        group_parts = self.namespace.lower().split(".")
        recognized_platforms = {platform.lower() for platform in RECOGNIZED_CODE_HOSTING_SERVICES}

        # For artifacts from recognized code hosting services, check if the
        # organization name is the same in maven and the source repository.
        # For example, com.github.foo matches github.com/foo,
        # but it doesn't match gitlab.com/foo or gitlab.com/bar.
        # The length check guards the index accesses: namespaces such as "com" or "com.github"
        # have no account segment to compare.
        if (
            len(group_parts) >= 3
            and group_parts[0] in {"io", "com"}
            and group_parts[1] in recognized_platforms  # e.g., github
            and group_parts[1] == reported_hostname  # e.g., github
            and group_parts[2] == reported_account  # e.g., foo in github.com/foo
        ):
            return RepositoryVerificationResult(
                status=RepositoryVerificationStatus.PASSED, reason="git_ns_match", build_tool=self.build_tool
            )

        return RepositoryVerificationResult(
            # Not necessarily a fail, because many projects use maven group ids other than their repo domain.
            status=RepositoryVerificationStatus.UNKNOWN,
            reason="git_ns_mismatch",
            build_tool=self.build_tool,
        )
def _verify_repository_link(self, parsed_purl: PackageURL, analyze_ctx: AnalyzeContext) -> None:
    """Verify whether the claimed repository links back to the artifact.

    One verification is run per detected build tool and the results are stored, in order,
    under the ``repo_verification`` key of the dynamic data of the analysis context.
    Nothing is stored when the component has no repository or when the PURL has no
    namespace or no version.
    """
    repository = analyze_ctx.component.repository
    if not repository:
        logger.debug("The repository is not available. Skipping the repository verification.")
        return

    namespace, version = parsed_purl.namespace, parsed_purl.version
    if namespace is None or version is None:
        logger.debug("The PURL is not complete. Skipping the repository verification.")
        return

    # Prefer the build tools detected in the repository and fall back to the ones derived from the PURL.
    build_spec = analyze_ctx.dynamic_data["build_spec"]
    build_tools = build_spec["tools"] or build_spec["purl_tools"]

    # Reset any previous results first, then fill the very same list while verifying.
    results: list = []
    analyze_ctx.dynamic_data["repo_verification"] = results

    for build_tool in build_tools:
        results.append(
            verify_repo(
                namespace=namespace,
                name=parsed_purl.name,
                version=version,
                reported_repo_url=repository.remote_path,
                reported_repo_fs=repository.fs_path,
                build_tool=build_tool,
            )
        )
+ +"""A check to determine whether the source repository of a maven package can be independently verified.""" + +import logging + +from packageurl import PackageURL +from sqlalchemy import ForeignKey, Integer, String +from sqlalchemy.orm import Mapped, mapped_column + +from macaron.database.table_definitions import CheckFacts +from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder +from macaron.repo_verifier.repo_verifier_base import RepositoryVerificationStatus +from macaron.slsa_analyzer.analyze_context import AnalyzeContext +from macaron.slsa_analyzer.checks.base_check import BaseCheck +from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence +from macaron.slsa_analyzer.registry import registry + +logger: logging.Logger = logging.getLogger(__name__) + + +class MavenRepoVerificationFacts(CheckFacts): + """The ORM mapping for justifications in maven source repo check.""" + + __tablename__ = "_maven_repo_verification_check" + + #: The primary key. + id: Mapped[int] = mapped_column(ForeignKey("_check_facts.id"), primary_key=True) # noqa: A003 + + group: Mapped[str] = mapped_column(String, nullable=False) + artifact: Mapped[str] = mapped_column(String, nullable=False) + version: Mapped[str] = mapped_column(String, nullable=False) + + # Repository link identified by Macaron's repo finder. + repo_link: Mapped[str] = mapped_column(String, nullable=True) + + # Repository link identified by deps.dev. + deps_dev_repo_link: Mapped[str | None] = mapped_column(String, nullable=True) + + # Number of stars on the repository identified by deps.dev. + deps_dev_stars_count: Mapped[int | None] = mapped_column(Integer, nullable=True) + + # Number of forks on the repository identified by deps.dev. + deps_dev_fork_count: Mapped[int | None] = mapped_column(Integer, nullable=True) + + # The status of the check: passed, failed, or unknown. 
class MavenRepoVerificationCheck(BaseCheck):
    """Check whether the claims of a source repository provenance made by a maven package can be independently verified."""

    def __init__(self) -> None:
        """Initialize a check instance."""
        super().__init__(
            check_id="mcn_maven_repo_verification_1",
            description=(
                "Check whether the claims of a source repository provenance"
                " made by a maven package can be independently verified."
            ),
        )

    def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
        """Implement the check in this method.

        One fact row is produced per repository verification result stored in the context.
        The check passes if any verification passed. It fails if a verification failed while
        the running outcome was still unknown. Otherwise the outcome is unknown.

        Parameters
        ----------
        ctx : AnalyzeContext
            The object containing processed data for the target repo.

        Returns
        -------
        CheckResultData
            The result of the check.
        """
        if ctx.component.type != "maven":
            return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN)

        # Collect what deps.dev reports for this package, to be stored alongside every fact.
        finder = DepsDevRepoFinder()
        deps_dev_repo_link = finder.find_repo(PackageURL.from_string(ctx.component.purl))
        project_info = finder.get_project_info(deps_dev_repo_link)

        stars_count: int | None = project_info.get("starsCount") if project_info else None
        fork_count: int | None = project_info.get("forksCount") if project_info else None

        facts: list[CheckFacts] = []
        outcome = CheckResultType.UNKNOWN
        for verification in ctx.dynamic_data.get("repo_verification", []):
            fact = MavenRepoVerificationFacts(
                group=ctx.component.namespace,
                artifact=ctx.component.name,
                version=ctx.component.version,
                repo_link=ctx.component.repository.remote_path if ctx.component.repository else None,
                deps_dev_repo_link=deps_dev_repo_link,
                deps_dev_stars_count=stars_count,
                deps_dev_fork_count=fork_count,
                status=verification.status.value,
                reason=verification.reason,
                build_tool=verification.build_tool.name,
                confidence=Confidence.MEDIUM,
            )
            facts.append(fact)

            # A pass always wins; a fail only replaces an outcome that is still unknown.
            if verification.status == RepositoryVerificationStatus.PASSED:
                outcome = CheckResultType.PASSED
            elif (
                outcome == CheckResultType.UNKNOWN and verification.status == RepositoryVerificationStatus.FAILED
            ):
                outcome = CheckResultType.FAILED

        return CheckResultData(result_tables=facts, result_type=outcome)


registry.register(MavenRepoVerificationCheck())
# These are the code hosting platforms that are recognized by Sonatype for namespace verification in maven central.
RECOGNIZED_CODE_HOSTING_SERVICES = [
    "github",
    "gitlab",
    "bitbucket",
    "gitee",
]


def same_organization(group_id_1: str, group_id_2: str) -> bool:
    """Check if two maven group ids are from the same organization.

    Note: It is assumed that for recognized source platforms, the top level domain doesn't change the organization.
    I.e., io.github.foo and com.github.foo are assumed to be from the same organization.
    The platform itself does matter: io.github.foo and io.gitlab.foo are accounts on different
    services and are not assumed to belong to the same organization.

    Parameters
    ----------
    group_id_1 : str
        The first group id.
    group_id_2 : str
        The second group id.

    Returns
    -------
    bool
        ``True`` if the two group ids are from the same organization, ``False`` otherwise.
    """
    if group_id_1 == group_id_2:
        return True

    group_id_1_parts = group_id_1.split(".")
    group_id_2_parts = group_id_2.split(".")
    if min(len(group_id_1_parts), len(group_id_2_parts)) < 2:
        return False

    def _is_hosted_namespace(parts: list[str]) -> bool:
        """Return True if the group id starts with a recognized hosting namespace, e.g., io.github."""
        return parts[0] in {"io", "com"} and parts[1] in RECOGNIZED_CODE_HOSTING_SERVICES

    # For group ids that are under recognized maven namespaces, the organization is the account name,
    # i.e., the third part. For example, io.github.foo.bar and io.github.foo are from the same
    # organization (foo). Also, io.github.foo and com.github.foo are from the same organization.
    if _is_hosted_namespace(group_id_1_parts) and _is_hosted_namespace(group_id_2_parts):
        if len(group_id_1_parts) < 3 or len(group_id_2_parts) < 3:
            # There is no account name to compare.
            return False
        # The same account name on two different platforms does not identify the same organization.
        return group_id_1_parts[1] == group_id_2_parts[1] and group_id_1_parts[2] == group_id_2_parts[2]

    # Otherwise the organization is identified by the first two parts, e.g., org.apache.
    return group_id_1_parts[:2] == group_id_2_parts[:2]
diff --git a/tests/integration/cases/maven_repo_verification/policy_pass_1.dl b/tests/integration/cases/maven_repo_verification/policy_pass_1.dl new file mode 100644 index 000000000..d43fd1f0f --- /dev/null +++ b/tests/integration/cases/maven_repo_verification/policy_pass_1.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_maven_repo_verification_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:maven/org.antlr/antlr4-maven-plugin@4.13.2"). diff --git a/tests/integration/cases/maven_repo_verification/policy_pass_2.dl b/tests/integration/cases/maven_repo_verification/policy_pass_2.dl new file mode 100644 index 000000000..49cd44d1e --- /dev/null +++ b/tests/integration/cases/maven_repo_verification/policy_pass_2.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_maven_repo_verification_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:maven/org.neo4j/cypher-parser-common@5.21.2"). diff --git a/tests/integration/cases/maven_repo_verification/test.yaml b/tests/integration/cases/maven_repo_verification/test.yaml new file mode 100644 index 000000000..2d084c954 --- /dev/null +++ b/tests/integration/cases/maven_repo_verification/test.yaml @@ -0,0 +1,44 @@ +# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +description: | + Integration tests for mcn_maven_repo_verification_1 check. + +tags: +- macaron-python-package +- macaron-docker-image + +steps: +- name: Run macaron analyze on passing case 1 + kind: analyze + options: + ini: config.ini + command_args: + - -purl + - pkg:maven/org.antlr/antlr4-maven-plugin@4.13.2 +- name: Verify that the check passed + kind: verify + options: + policy: policy_pass_1.dl +- name: Run macaron analyze on passing case 2 + kind: analyze + options: + ini: config.ini + command_args: + - -purl + - pkg:maven/org.neo4j/cypher-parser-common@5.21.2 +- name: Verify that the check passed + kind: verify + options: + policy: policy_pass_2.dl +- name: Run macaron analyze on failing case 1 + kind: analyze + options: + ini: config.ini + command_args: + - -purl + - pkg:maven/com.alibaba.ververica/flink-cep@1.17-vvr-8.0.8 +- name: Verify that the check failed + kind: verify + options: + policy: policy_fail_1.dl diff --git a/tests/slsa_analyzer/checks/test_maven_repo_verification_check.py b/tests/slsa_analyzer/checks/test_maven_repo_verification_check.py new file mode 100644 index 000000000..9d12c751c --- /dev/null +++ b/tests/slsa_analyzer/checks/test_maven_repo_verification_check.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""Module to test the maven repository verification check."""

from pathlib import Path

from macaron.repo_verifier.repo_verifier_base import RepositoryVerificationResult, RepositoryVerificationStatus
from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
from macaron.slsa_analyzer.checks.check_result import CheckResultType
from macaron.slsa_analyzer.checks.maven_repo_verification_check import MavenRepoVerificationCheck
from macaron.slsa_analyzer.package_registry import PyPIRegistry
from macaron.slsa_analyzer.package_registry.maven_central_registry import MavenCentralRegistry
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
from tests.conftest import MockAnalyzeContext

RESOURCE_PATH = Path(__file__).parent.joinpath("resources")


def _run_check_with(
    macaron_path: Path,
    purl: str,
    build_tool: BaseBuildTool,
    registry_type: type,
    status: RepositoryVerificationStatus | None = None,
) -> CheckResultType:
    """Run the check on a mock context and return the resulting check result type.

    The context is given one package registry entry built from ``build_tool`` and a fresh ``registry_type``
    instance. A single repository verification result with the given ``status`` is attached only when
    ``status`` is not ``None``.
    """
    check = MavenRepoVerificationCheck()

    ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl)
    registry = registry_type()
    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(build_tool, registry)]
    if status is not None:
        ctx.dynamic_data["repo_verification"] = [
            RepositoryVerificationResult(
                status=status,
                reason="",
                build_tool=build_tool,
            )
        ]

    return check.run_check(ctx).result_type


def test_repo_verification_pass(maven_tool: BaseBuildTool, macaron_path: Path) -> None:
    """A passed repository verification makes the check pass."""
    outcome = _run_check_with(
        macaron_path,
        "pkg:maven/test/test",
        maven_tool,
        MavenCentralRegistry,
        RepositoryVerificationStatus.PASSED,
    )
    assert outcome == CheckResultType.PASSED


def test_repo_verification_fail(maven_tool: BaseBuildTool, macaron_path: Path) -> None:
    """A failed repository verification makes the check fail."""
    outcome = _run_check_with(
        macaron_path,
        "pkg:maven/test/test",
        maven_tool,
        MavenCentralRegistry,
        RepositoryVerificationStatus.FAILED,
    )
    assert outcome == CheckResultType.FAILED


def test_repo_verification_unknown_for_unknown_repo_verification(maven_tool: BaseBuildTool, macaron_path: Path) -> None:
    """An unknown repository verification leaves the check result unknown."""
    outcome = _run_check_with(
        macaron_path,
        "pkg:maven/test/test",
        maven_tool,
        MavenCentralRegistry,
        RepositoryVerificationStatus.UNKNOWN,
    )
    assert outcome == CheckResultType.UNKNOWN


def test_repo_verification_unknown_for_unsupported_build_tools(pip_tool: BaseBuildTool, macaron_path: Path) -> None:
    """A non-maven component with no repository verification data yields an unknown result."""
    outcome = _run_check_with(macaron_path, "pkg:pypi/test/test", pip_tool, PyPIRegistry)
    assert outcome == CheckResultType.UNKNOWN