Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ require (
github.com/knqyf263/go-rpmdb v0.0.0-20230301153543-ba94b245509b
github.com/opencontainers/go-digest v1.0.0
github.com/saferwall/pe v1.4.4
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d
github.com/sassoftware/go-rpmutils v0.2.0
github.com/vbatts/go-mtree v0.5.3
github.com/zyedidia/generic v1.2.2-0.20230320175451-4410d2372cb1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,8 @@ github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb
github.com/saferwall/pe v1.4.4 h1:Ml++7/2/Z1iKwV4zCsd1nIqTEAdUQKAetwbbcCarhOg=
github.com/saferwall/pe v1.4.4/go.mod h1:SNzv3cdgk8SBI0UwHfyTcdjawfdnN+nbydnEL7GZ25s=
github.com/sagikazarmark/crypt v0.3.0/go.mod h1:uD/D+6UF4SrIR1uGEv7bBNkNqLGqUr43MRiaGWX1Nig=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/sassoftware/go-rpmutils v0.2.0 h1:pKW0HDYMFWQ5b4JQPiI3WI12hGsVoW0V8+GMoZiI/JE=
github.com/sassoftware/go-rpmutils v0.2.0/go.mod h1:TJJQYtLe/BeEmEjelI3b7xNZjzAukEkeWKmoakvaOoI=
github.com/scylladb/go-set v1.0.3-0.20200225121959-cc7b2070d91e h1:7q6NSFZDeGfvvtIRwBrU/aegEYJYmvev0cHAwo17zZQ=
Expand Down
39 changes: 37 additions & 2 deletions syft/pkg/cataloger/java/parse_pom_xml.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package java

import (
"bytes"
"encoding/xml"
"fmt"
"io"
"reflect"
"regexp"
"strings"

"github.com/saintfish/chardet"
"github.com/vifraa/gopom"
"golang.org/x/net/html/charset"

Expand Down Expand Up @@ -99,16 +101,49 @@ func newPackageFromPom(pom gopom.Project, dep gopom.Dependency, locations ...fil
}

func decodePomXML(content io.Reader) (project gopom.Project, err error) {
decoder := xml.NewDecoder(content)
// prevent against warnings for "xml: encoding "iso-8859-1" declared but Decoder.CharsetReader is nil"
inputReader, err := getUtf8Reader(content)
if err != nil {
return project, fmt.Errorf("unable to read pom.xml: %w", err)
}

decoder := xml.NewDecoder(inputReader)
// when an xml file has a character set declaration (e.g. '<?xml version="1.0" encoding="ISO-8859-1"?>') read that and use the correct decoder
decoder.CharsetReader = charset.NewReaderLabel

if err := decoder.Decode(&project); err != nil {
return project, fmt.Errorf("unable to unmarshal pom.xml: %w", err)
}

return project, nil
}

func getUtf8Reader(content io.Reader) (io.Reader, error) {
pomContents, err := io.ReadAll(content)
if err != nil {
return nil, err
}

detector := chardet.NewTextDetector()
detection, err := detector.DetectBest(pomContents)

var inputReader io.Reader
if err == nil && detection != nil {
if detection.Charset == "UTF-8" {
inputReader = bytes.NewReader(pomContents)
} else {
inputReader, err = charset.NewReaderLabel(detection.Charset, bytes.NewReader(pomContents))
if err != nil {
return nil, fmt.Errorf("unable to get encoding: %w", err)
}
}
} else {
// we could not detect the encoding, but we want a valid file to read. Replace unreadable
// characters with the UTF-8 replacement character.
inputReader = strings.NewReader(strings.ToValidUTF8(string(pomContents), "�"))
}
return inputReader, nil
}

func pomParent(pom gopom.Project, parent *gopom.Parent) (result *pkg.PomParent) {
if parent == nil {
return nil
Expand Down
83 changes: 83 additions & 0 deletions syft/pkg/cataloger/java/parse_pom_xml_test.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
package java

import (
"encoding/base64"
"io"
"os"
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/vifraa/gopom"

"github.com/anchore/syft/syft/file"
Expand Down Expand Up @@ -63,6 +67,60 @@ func Test_parserPomXML(t *testing.T) {
}
}

func Test_decodePomXML_surviveNonUtf8Encoding(t *testing.T) {
// regression for https://github.com/anchore/syft/issues/2044

// we are storing the base64 contents of the pom.xml file. We are doing this to prevent accidental changes to the
// file, which is extremely important for this test.

// for instance, even changing a single character in the file and saving in an IntelliJ IDE will automatically
// convert the file to UTF-8, which will break this test:

// xxd with the original pom.xml
// 00000780: 6964 3e0d 0a20 2020 2020 2020 2020 2020 id>..
// 00000790: 203c 6e61 6d65 3e4a e972 f46d 6520 4d69 <name>J.r.me Mi
// 000007a0: 7263 3c2f 6e61 6d65 3e0d 0a20 2020 2020 rc</name>..

// xxd with the pom.xml converted to UTF-8 (from a simple change with IntelliJ)
// 00000780: 6964 3e0d 0a20 2020 2020 2020 2020 2020 id>..
// 00000790: 203c 6e61 6d65 3e4a efbf bd72 efbf bd6d <name>J...r...m
// 000007a0: 6520 4d69 7263 3c2f 6e61 6d65 3e0d 0a20 e Mirc</name>..

// Note that the name "Jérôme Mirc" was originally interpreted as "J.r.me Mi" and after the save
// is now encoded as "J...r...m" which is not what we want (note the extra bytes for each non UTF-8 character.
// The original 0xe9 byte (é) was converted to 0xefbfbd (�) which is the UTF-8 replacement character.
// This is quite silly on the part of IntelliJ, but it is what it is.

cases := []struct {
name string
fixture string
}{
{
name: "undeclared encoding",
fixture: "test-fixtures/pom/undeclared-iso-8859-encoded-pom.xml.base64",
},
{
name: "declared encoding",
fixture: "test-fixtures/pom/declared-iso-8859-encoded-pom.xml.base64",
},
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
fh, err := os.Open(c.fixture)
require.NoError(t, err)

decoder := base64.NewDecoder(base64.StdEncoding, fh)

proj, err := decodePomXML(decoder)

require.NoError(t, err)
require.NotEmpty(t, proj.Developers)
})
}

}

func Test_parseCommonsTextPomXMLProject(t *testing.T) {
tests := []struct {
input string
Expand Down Expand Up @@ -401,3 +459,28 @@ func Test_resolveProperty(t *testing.T) {
func stringPointer(s string) *string {
return &s
}

func Test_getUtf8Reader(t *testing.T) {
tests := []struct {
name string
contents string
}{
{
name: "unknown encoding",
// random binary contents
contents: "BkiJz02JyEWE0nXR6TH///9NicpJweEETIucJIgAAABJicxPjQwhTY1JCE05WQh0BU2J0eunTYshTIusJIAAAAAPHwBNOeV1BUUx2+tWTIlUJDhMiUwkSEyJRCQgSIl8JFBMiQ==",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
decoder := base64.NewDecoder(base64.StdEncoding, strings.NewReader(tt.contents))

got, err := getUtf8Reader(decoder)
require.NoError(t, err)
gotBytes, err := io.ReadAll(got)
require.NoError(t, err)
// if we couldn't decode the section as UTF-8, we should get a replacement character
assert.Contains(t, string(gotBytes), "�")
})
}
}
Loading