diff --git a/CHANGELOG.md b/CHANGELOG.md index f37d38f..022b5fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Upcoming changes... + +## [0.12.0] - 2025-11-04 ### Added - `calculateOppositeLineEndingHash()` method in `WinnowingUtils` to compute hash with opposite line endings (Unix ↔ Windows) - FH2 hash included in WFP output format as `fh2=` - Support for detecting CRLF (Windows), LF (Unix), and CR (legacy Mac) line endings +### Fixed +- Fixed WFP parsing issue ## [0.11.0] - 2025-05-26 ### Added @@ -126,4 +130,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [0.9.0]: https://github.com/scanoss/scanoss.java/compare/v0.8.1...v0.9.0 [0.10.0]: https://github.com/scanoss/scanoss.java/compare/v0.9.0...v0.10.0 [0.10.1]: https://github.com/scanoss/scanoss.java/compare/v0.10.0...v0.10.1 -[0.11.0]: https://github.com/scanoss/scanoss.java/compare/v0.10.1...v0.11.0 \ No newline at end of file +[0.11.0]: https://github.com/scanoss/scanoss.java/compare/v0.10.1...v0.11.0 +[0.12.0]: https://github.com/scanoss/scanoss.java/compare/v0.11.0...v0.12.0 \ No newline at end of file diff --git a/pom.xml b/pom.xml index ac0951f..343504a 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.scanoss scanoss - 0.11.0 + 0.12.0 jar scanoss.java https://github.com/scanoss/scanoss.java diff --git a/src/main/java/com/scanoss/Winnowing.java b/src/main/java/com/scanoss/Winnowing.java index aa2c8eb..8ae3954 100644 --- a/src/main/java/com/scanoss/Winnowing.java +++ b/src/main/java/com/scanoss/Winnowing.java @@ -154,15 +154,16 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c } wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename)); - if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) { - return wfpBuilder.toString(); - } String fh2 = WinnowingUtils.calculateOppositeLineEndingHash(contents); if (fh2 != null){ wfpBuilder.append(String.format("fh2=%s\n",fh2)); } + if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) { + return wfpBuilder.toString(); + } + if(this.isHpsm()){ wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents))); } @@ -194,11 +195,7 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c if (lastLine != line) { int obLength = outputBuilder.length(); if (obLength > 0) { - if (snippetLimit > 0 && obLength > snippetLimit) { - log.debug("Skipping snippet line as it's too big ({}): {}", filename, outputBuilder); - } else { - wfpBuilder.append(outputBuilder).append("\n"); - } + wfpBuilder.append(outputBuilder).append("\n"); } outputBuilder.delete(0, obLength); outputBuilder.append(String.format("%d=%s", line, minHashHex)); @@ -216,11 +213,7 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c } int obLength = outputBuilder.length(); if (obLength > 0) { - if (snippetLimit > 0 && obLength > snippetLimit) { - log.debug("Skipping snippet line as it's too big ({}) {} - {}: {}", filename, snippetLimit, obLength, outputBuilder); - } else { - wfpBuilder.append(outputBuilder).append("\n"); - } + wfpBuilder.append(outputBuilder).append("\n"); } return wfpBuilder.toString(); } @@ -275,8 +268,21 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) { log.trace("Skipping snippets as the file is too small: {} - {}", filename, contents.length); return true; } - if (contents[0] == '{' || contents[0] == '<') { - log.trace("Skipping snippets as the file appears to be JSON/XML/HTML: {}", filename); + //See https://github.com/scanoss/scanoss.py/blob/ede0477f3ea1b13a0147154b565b1bf6a72a6843/src/scanoss/winnowing.py#L248-L260 + //for python implementation reference + + // Create prefix from first MIN_FILE_SIZE-1 characters, lowercase and trimmed + String prefix = new String(contents, 0, ScanossConstants.MIN_FILE_SIZE - 1).toLowerCase().strip(); + + // Check for JSON files (starts with { or [) + if (prefix.charAt(0) == '{' || prefix.charAt(0) == '[') { + log.trace("Skipping snippets as the file appears to be JSON: {}", filename); + return true; + } + // Check for XML/HTML/AC3D files with explicit prefix matching + if (prefix.startsWith(" 0 && firstLineEnd > snippetLimit) { + log.trace("Skipping snippets due to first line being too long: {} - {} chars", filename, firstLineEnd); + return true; + } return false; } diff --git a/src/test/java/com/scanoss/TestWinnowing.java b/src/test/java/com/scanoss/TestWinnowing.java index c3b5fa0..449a324 100644 --- a/src/test/java/com/scanoss/TestWinnowing.java +++ b/src/test/java/com/scanoss/TestWinnowing.java @@ -165,13 +165,13 @@ public void TestWinnowingFileSkipSnippets() { String wfp = winnowing.wfpForFile(file, file); log.info("WFP for Json: {}", wfp); assertNotNull("Expected a result from WFP", wfp); - assertEquals("file=f8d52217f24ea77ff80a6b1f421d0959,229084,testing/data/non-source.json", wfp.trim()); + assertEquals("file=f8d52217f24ea77ff80a6b1f421d0959,229084,testing/data/non-source.json\nfh2=dcae9929f4436808df739f19804cb4d2", wfp.trim()); file = "testing/data/test-file.txt"; wfp = winnowing.wfpForFile(file, file); log.info("WFP for Json: {}", wfp); assertNotNull("Expected a result from WFP", wfp); - assertEquals("file=e3dd1a7915d51c8cd1498585e6cea41e,183,testing/data/test-file.txt", wfp.trim()); + assertEquals("file=e3dd1a7915d51c8cd1498585e6cea41e,183,testing/data/test-file.txt\nfh2=7de74202074d60759e60f408391e70c4", wfp.trim()); file = "testing/data/too-small.c"; wfp = winnowing.wfpForFile(file, file); @@ -183,7 +183,7 @@ public void TestWinnowingFileSkipSnippets() { wfp = winnowing.wfpForFile(file, file); log.info("WFP for Json: {}", wfp); assertNotNull("Expected a result from WFP", wfp); - assertEquals("file=d7cfce9cff6d109c6b0249233ee26368,345,testing/data/json-file.c", wfp.trim()); + assertEquals("file=d7cfce9cff6d109c6b0249233ee26368,345,testing/data/json-file.c\nfh2=56c80b467d3bcc13da74943a82c69724", wfp.trim()); file = "testing/data/source-file-with-long-line.c"; wfp = winnowing.wfpForFile(file, file);