Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions cmd/syft/internal/options/catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,10 @@ func (cfg Catalog) ToPackagesConfig() pkgcataloging.Config {
},
Nix: nix.DefaultConfig().
WithCaptureOwnedFiles(cfg.Nix.CaptureOwnedFiles),
Python: python.CatalogerConfig{
GuessUnpinnedRequirements: cfg.Python.GuessUnpinnedRequirements,
},
Python: python.DefaultCatalogerConfig().
WithSearchRemoteLicenses(*multiLevelOption(false, enrichmentEnabled(cfg.Enrich, task.Python), cfg.Python.SearchRemoteLicenses)).
WithPypiBaseURL(cfg.Python.PypiBaseURL).
WithGuessUnpinnedRequirements(*multiLevelOption(false, enrichmentEnabled(cfg.Enrich, task.Python), cfg.Python.GuessUnpinnedRequirements)),
JavaArchive: java.DefaultArchiveCatalogerConfig().
WithUseMavenLocalRepository(*multiLevelOption(false, enrichmentEnabled(cfg.Enrich, task.Java, task.Maven), cfg.Java.UseMavenLocalRepository)).
WithMavenLocalRepositoryDir(cfg.Java.MavenLocalRepositoryDir).
Expand Down Expand Up @@ -320,6 +321,7 @@ var publicisedEnrichmentOptions = []string{
task.Golang,
task.Java,
task.JavaScript,
task.Python,
}

func enrichmentEnabled(enrichDirectives []string, features ...string) *bool {
Expand Down
6 changes: 5 additions & 1 deletion cmd/syft/internal/options/python.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@ package options
import "github.com/anchore/clio"

type pythonConfig struct {
GuessUnpinnedRequirements bool `json:"guess-unpinned-requirements" yaml:"guess-unpinned-requirements" mapstructure:"guess-unpinned-requirements"`
SearchRemoteLicenses *bool `json:"search-remote-licenses" yaml:"search-remote-licenses" mapstructure:"search-remote-licenses"`
PypiBaseURL string `json:"pypi-base-url" yaml:"pypi-base-url" mapstructure:"pypi-base-url"`
GuessUnpinnedRequirements *bool `json:"guess-unpinned-requirements" yaml:"guess-unpinned-requirements" mapstructure:"guess-unpinned-requirements"`
}

// Compile-time assertion that *pythonConfig implements clio.FieldDescriber.
var _ interface {
	clio.FieldDescriber
} = (*pythonConfig)(nil)

func (o *pythonConfig) DescribeFields(descriptions clio.FieldDescriptionSet) {
descriptions.Add(&o.SearchRemoteLicenses, `enables Syft to use the network to fill in more detailed license information`)
descriptions.Add(&o.PypiBaseURL, `base Pypi url to use`)
descriptions.Add(&o.GuessUnpinnedRequirements, `when running across entries in requirements.txt that do not specify a specific version
(e.g. "sqlalchemy >= 1.0.0, <= 2.0.0, != 3.0.0, <= 3.0.0"), attempt to guess what the version could
be based on the version requirements specified (e.g. "1.0.0"). When enabled the lowest expressible version
Expand Down
7 changes: 5 additions & 2 deletions internal/task/package_tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ const (
JavaScript = "javascript"
Node = "node"
NPM = "npm"

// Python ecosystem labels
Python = "python"
)

//nolint:funlen
Expand Down Expand Up @@ -109,7 +112,7 @@ func DefaultPackageTaskFactories() Factories {
func(cfg CatalogingFactoryConfig) pkg.Cataloger {
return python.NewPackageCataloger(cfg.PackagesConfig.Python)
},
pkgcataloging.DeclaredTag, pkgcataloging.DirectoryTag, pkgcataloging.LanguageTag, "python",
pkgcataloging.DeclaredTag, pkgcataloging.DirectoryTag, pkgcataloging.LanguageTag, Python,
),
newSimplePackageTaskFactory(ruby.NewGemFileLockCataloger, pkgcataloging.DeclaredTag, pkgcataloging.DirectoryTag, pkgcataloging.LanguageTag, "ruby", "gem"),
newSimplePackageTaskFactory(ruby.NewGemSpecCataloger, pkgcataloging.DeclaredTag, pkgcataloging.DirectoryTag, pkgcataloging.LanguageTag, "ruby", "gem", "gemspec"),
Expand All @@ -127,7 +130,7 @@ func DefaultPackageTaskFactories() Factories {
pkgcataloging.InstalledTag, pkgcataloging.ImageTag, pkgcataloging.DirectoryTag, pkgcataloging.LanguageTag, "dotnet", "c#",
),
newSimplePackageTaskFactory(dotnet.NewDotnetPackagesLockCataloger, pkgcataloging.DeclaredTag, pkgcataloging.ImageTag, pkgcataloging.DirectoryTag, pkgcataloging.LanguageTag, "dotnet", "c#"),
newSimplePackageTaskFactory(python.NewInstalledPackageCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, pkgcataloging.LanguageTag, "python"),
newSimplePackageTaskFactory(python.NewInstalledPackageCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, pkgcataloging.LanguageTag, Python),
newPackageTaskFactory(
func(cfg CatalogingFactoryConfig) pkg.Cataloger {
return golang.NewGoModuleBinaryCataloger(cfg.PackagesConfig.Golang)
Expand Down
3 changes: 2 additions & 1 deletion syft/internal/fileresolver/metadata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import (
"os"
"testing"

"github.com/anchore/stereoscope/pkg/file"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/anchore/stereoscope/pkg/file"
)

func TestFileMetadataFromPath(t *testing.T) {
Expand Down
31 changes: 12 additions & 19 deletions syft/pkg/cataloger/python/cataloger.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,21 @@ import (

const eggInfoGlob = "**/*.egg-info"

type CatalogerConfig struct {
// GuessUnpinnedRequirements attempts to infer package versions from version constraints when no explicit version is specified in requirements files.
// app-config: python.guess-unpinned-requirements
GuessUnpinnedRequirements bool `yaml:"guess-unpinned-requirements" json:"guess-unpinned-requirements" mapstructure:"guess-unpinned-requirements"`
}

func DefaultCatalogerConfig() CatalogerConfig {
return CatalogerConfig{
GuessUnpinnedRequirements: false,
}
}

// NewPackageCataloger returns a new cataloger for python packages referenced from poetry lock files, requirements.txt files, and setup.py files.
func NewPackageCataloger(cfg CatalogerConfig) pkg.Cataloger {
rqp := newRequirementsParser(cfg)
poetryLockParser := newPoetryLockParser(cfg)
pipfileLockParser := newPipfileLockParser(cfg)
setupFileParser := newSetupFileParser(cfg)
uvLockParser := newUvLockParser(cfg)
pdmLockParser := newPdmLockParser(cfg)
requirementsFileParser := newRequirementsParser(cfg)
return generic.NewCataloger("python-package-cataloger").
WithParserByGlobs(rqp.parseRequirementsTxt, "**/*requirements*.txt").
WithParserByGlobs(parsePoetryLock, "**/poetry.lock").
WithParserByGlobs(parsePipfileLock, "**/Pipfile.lock").
WithParserByGlobs(parseSetup, "**/setup.py").
WithParserByGlobs(parseUvLock, "**/uv.lock").
WithParserByGlobs(parsePdmLock, "**/pdm.lock")
WithParserByGlobs(requirementsFileParser.parseRequirementsTxt, "**/*requirements*.txt").
WithParserByGlobs(poetryLockParser.parsePoetryLock, "**/poetry.lock").
WithParserByGlobs(pipfileLockParser.parsePipfileLock, "**/Pipfile.lock").
WithParserByGlobs(setupFileParser.parseSetupFile, "**/setup.py").
WithParserByGlobs(uvLockParser.parseUvLock, "**/uv.lock").
WithParserByGlobs(pdmLockParser.parsePdmLock, "**/pdm.lock")
}

// NewInstalledPackageCataloger returns a new cataloger for python packages within egg or wheel installation directories.
Expand Down
40 changes: 40 additions & 0 deletions syft/pkg/cataloger/python/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package python

// pypiBaseURL is the PyPI registry base URL that DefaultCatalogerConfig uses
// unless a different one is supplied.
const pypiBaseURL = "https://pypi.org/pypi"

// CatalogerConfig holds the options that control the python catalogers.
type CatalogerConfig struct {
	// GuessUnpinnedRequirements attempts to infer package versions from version constraints when no explicit version is specified in requirements files.
	// app-config: python.guess-unpinned-requirements
	GuessUnpinnedRequirements bool `yaml:"guess-unpinned-requirements" json:"guess-unpinned-requirements" mapstructure:"guess-unpinned-requirements"`
	// SearchRemoteLicenses enables querying the PyPI registry API to retrieve license information for packages that are missing license data in their local metadata.
	// app-config: python.search-remote-licenses
	SearchRemoteLicenses bool `json:"search-remote-licenses" yaml:"search-remote-licenses" mapstructure:"search-remote-licenses"`
	// PypiBaseURL specifies the base URL for the PyPI registry API used when searching for remote license information.
	// app-config: python.pypi-base-url
	PypiBaseURL string `json:"pypi-base-url" yaml:"pypi-base-url" mapstructure:"pypi-base-url"`
}

// DefaultCatalogerConfig returns the baseline python cataloger configuration:
// version guessing and remote license lookups are both disabled, and the
// registry base URL is the package default.
func DefaultCatalogerConfig() CatalogerConfig {
	var defaults CatalogerConfig
	defaults.GuessUnpinnedRequirements = false
	defaults.SearchRemoteLicenses = false
	defaults.PypiBaseURL = pypiBaseURL
	return defaults
}

// WithSearchRemoteLicenses returns a copy of the config with
// SearchRemoteLicenses set to input; the receiver is passed by value and is
// not modified.
func (c CatalogerConfig) WithSearchRemoteLicenses(input bool) CatalogerConfig {
	c.SearchRemoteLicenses = input
	return c
}

// WithGuessUnpinnedRequirements returns a copy of the config with
// GuessUnpinnedRequirements set to input; the receiver is passed by value and
// is not modified.
func (c CatalogerConfig) WithGuessUnpinnedRequirements(input bool) CatalogerConfig {
	c.GuessUnpinnedRequirements = input
	return c
}

// WithPypiBaseURL returns a copy of the config whose PypiBaseURL is input.
// An empty input is ignored so that the currently configured URL is kept.
func (c CatalogerConfig) WithPypiBaseURL(input string) CatalogerConfig {
	if input == "" {
		return c
	}
	c.PypiBaseURL = input
	return c
}
4 changes: 3 additions & 1 deletion syft/pkg/cataloger/python/dependency_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package python

import (
"context"
"os"
"testing"

Expand Down Expand Up @@ -259,7 +260,8 @@ func Test_poetryLockDependencySpecifier_againstPoetryLock(t *testing.T) {
fh, err := os.Open(tt.fixture)
require.NoError(t, err)

pkgs, err := poetryLockPackages(file.NewLocationReadCloser(file.NewLocation(tt.fixture), fh))
plp := newPoetryLockParser(DefaultCatalogerConfig())
pkgs, err := plp.poetryLockPackages(context.TODO(), file.NewLocationReadCloser(file.NewLocation(tt.fixture), fh))
require.NoError(t, err)

var got []dependency.Specification
Expand Down
131 changes: 131 additions & 0 deletions syft/pkg/cataloger/python/license.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package python

import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"

"github.com/anchore/syft/internal/cache"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/pkg"
)

// pythonLicenseResolver looks up license information for python packages,
// consulting the remote registry when the cataloger config enables it.
type pythonLicenseResolver struct {
	// catalogerConfig supplies the SearchRemoteLicenses switch and the PypiBaseURL used for lookups.
	catalogerConfig CatalogerConfig
	// licenseCache resolves license lookups keyed by "<name>/<version>".
	// NOTE(review): presumably failed lookups are cached too, given the
	// GetResolverCachingErrors constructor — confirm against the cache package.
	licenseCache cache.Resolver[[]pkg.License]
}

// newPythonLicenseResolver builds a license resolver bound to the given
// cataloger config and backed by the "python"/"v1" license cache.
func newPythonLicenseResolver(config CatalogerConfig) pythonLicenseResolver {
	var resolver pythonLicenseResolver
	resolver.licenseCache = cache.GetResolverCachingErrors[[]pkg.License]("python", "v1")
	resolver.catalogerConfig = config
	return resolver
}

// getLicenses returns the licenses found for the given package name and
// version. When remote license search is disabled, or the remote lookup fails
// or yields nothing, the zero-value license set is returned; lookup failures
// are only logged at debug level.
func (lr *pythonLicenseResolver) getLicenses(ctx context.Context, packageName string, packageVersion string) pkg.LicenseSet {
	var found pkg.LicenseSet

	if !lr.catalogerConfig.SearchRemoteLicenses {
		return found
	}

	remote, err := lr.getLicensesFromRemote(ctx, packageName, packageVersion)
	switch {
	case err != nil:
		log.Debugf("unable to extract licenses from pypi registry for package %s:%s: %+v", packageName, packageVersion, err)
	case remote != nil:
		found = pkg.NewLicenseSet(remote...)
	}
	return found
}

// getLicensesFromRemote resolves the licenses for a package release through
// the license cache, keyed by "<packageName>/<packageVersion>". On a cache
// miss the license string is fetched from the configured PyPI registry and
// converted into pkg.License values.
//
// It returns a nil slice and nil error when the registry reports no license.
// Errors are returned to the caller without being logged here, so each
// failure is reported once by whoever handles it.
func (lr *pythonLicenseResolver) getLicensesFromRemote(ctx context.Context, packageName string, packageVersion string) ([]pkg.License, error) {
	cacheKey := fmt.Sprintf("%s/%s", packageName, packageVersion)
	return lr.licenseCache.Resolve(cacheKey, func() ([]pkg.License, error) {
		license, err := getLicenseFromPypiRegistry(lr.catalogerConfig.PypiBaseURL, packageName, packageVersion)
		if err != nil {
			return nil, err
		}

		// An empty license string means the registry had nothing to report;
		// leave the slice nil so callers can tell "no data" from "has licenses".
		var licenses []pkg.License
		if license != "" {
			licenses = pkg.NewLicensesFromValuesWithContext(ctx, license)
		}
		return licenses, nil
	})
}

// formatPypiRegistryURL builds the registry JSON endpoint for a package
// release by joining baseURL with "<packageName>/<version>/json".
//
// It returns an error when packageName is blank or when baseURL cannot be
// parsed; in both cases the returned URL is empty.
func formatPypiRegistryURL(baseURL, packageName, version string) (requestURL string, err error) {
	if packageName == "" {
		return "", fmt.Errorf("unable to format pypi request for a blank package name")
	}

	requestURL, err = url.JoinPath(baseURL, packageName, version, "json")
	if err != nil {
		// Name and version are separated by ':' so the failing package can be read off the message.
		return "", fmt.Errorf("unable to format pypi request for pkg:version %s:%s; %w", packageName, version, err)
	}
	return requestURL, nil
}

func getLicenseFromPypiRegistry(baseURL, packageName, version string) (string, error) {
// "https://pypi.org/pypi/%s/%s/json", packageName, version
requestURL, err := formatPypiRegistryURL(baseURL, packageName, version)
if err != nil {
return "", fmt.Errorf("unable to format pypi request for pkg:version %s%s; %w", packageName, version, err)
}
log.WithFields("url", requestURL).Info("downloading python package from pypi")

pypiRequest, err := http.NewRequest(http.MethodGet, requestURL, nil)
if err != nil {
return "", fmt.Errorf("unable to format remote request: %w", err)
}

httpClient := &http.Client{
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should have some common configuration for timeout options, possibly retry behavior, rate limiting, etc. This is becoming more important as we add more online resolution. We might also think about adding these features in a way that could be used in parallel and to enhance existing SBOMs rather than only at creation time. I don't think that has to happen as part of this PR, but I think we're getting to the point where we need to start being conscious of problems people will run into more frequently, like the Maven rate limiting.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, that makes sense.

Timeout: time.Second * 10,
}

resp, err := httpClient.Do(pypiRequest)
if err != nil {
return "", fmt.Errorf("unable to get package from pypi registry: %w", err)
}
defer func() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this precede the status code check? There can be response bodies that need to be closed with other statuses, I think.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right.

if err := resp.Body.Close(); err != nil {
log.Errorf("unable to close body: %+v", err)
}
}()

if resp.StatusCode != 200 {
return "", fmt.Errorf("unable to get package from pypi registry")
}

bytes, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("unable to parse package from pypi registry: %w", err)
}

dec := json.NewDecoder(strings.NewReader(string(bytes)))

// Read "license" from the response
var pypiResponse struct {
Info struct {
License string `json:"license"`
LicenseExpression string `json:"license_expression"`
} `json:"info"`
}

if err := dec.Decode(&pypiResponse); err != nil {
return "", fmt.Errorf("unable to parse license from pypi registry: %w", err)
}

var license string
if pypiResponse.Info.LicenseExpression != "" {
license = pypiResponse.Info.LicenseExpression
} else {
license = pypiResponse.Info.License
}
log.Tracef("Retrieved License: %s", license)

return license, nil
}
Loading
Loading