Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Upcoming changes...

## [0.12.0] - 2025-11-04
### Added
- `calculateOppositeLineEndingHash()` method in `WinnowingUtils` to compute the hash of a file's contents with the opposite line endings (Unix ↔ Windows)
- FH2 hash included in WFP output format as `fh2=<hash>`
- Support for detecting CRLF (Windows), LF (Unix), and CR (legacy Mac) line endings
### Fixed
- Fixed WFP parsing issue

## [0.11.0] - 2025-05-26
### Added
Expand Down Expand Up @@ -126,4 +130,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[0.9.0]: https://github.com/scanoss/scanoss.java/compare/v0.8.1...v0.9.0
[0.10.0]: https://github.com/scanoss/scanoss.java/compare/v0.9.0...v0.10.0
[0.10.1]: https://github.com/scanoss/scanoss.java/compare/v0.10.0...v0.10.1
[0.11.0]: https://github.com/scanoss/scanoss.java/compare/v0.10.1...v0.11.0
[0.11.0]: https://github.com/scanoss/scanoss.java/compare/v0.10.1...v0.11.0
[0.12.0]: https://github.com/scanoss/scanoss.java/compare/v0.11.0...v0.12.0
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>com.scanoss</groupId>
<artifactId>scanoss</artifactId>
<version>0.11.0</version>
<version>0.12.0</version>
<packaging>jar</packaging>
<name>scanoss.java</name>
<url>https://github.com/scanoss/scanoss.java</url>
Expand Down
51 changes: 36 additions & 15 deletions src/main/java/com/scanoss/Winnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,16 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
}

wfpBuilder.append(String.format("file=%s,%d,%s\n", fileMD5, contents.length, filename));
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
return wfpBuilder.toString();
}

String fh2 = WinnowingUtils.calculateOppositeLineEndingHash(contents);
if (fh2 != null){
wfpBuilder.append(String.format("fh2=%s\n",fh2));
}

if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
return wfpBuilder.toString();
}

if(this.isHpsm()){
wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));
}
Expand Down Expand Up @@ -194,11 +195,7 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
if (lastLine != line) {
int obLength = outputBuilder.length();
if (obLength > 0) {
if (snippetLimit > 0 && obLength > snippetLimit) {
log.debug("Skipping snippet line as it's too big ({}): {}", filename, outputBuilder);
} else {
wfpBuilder.append(outputBuilder).append("\n");
}
wfpBuilder.append(outputBuilder).append("\n");
}
outputBuilder.delete(0, obLength);
outputBuilder.append(String.format("%d=%s", line, minHashHex));
Expand All @@ -216,11 +213,7 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
}
int obLength = outputBuilder.length();
if (obLength > 0) {
if (snippetLimit > 0 && obLength > snippetLimit) {
log.debug("Skipping snippet line as it's too big ({}) {} - {}: {}", filename, snippetLimit, obLength, outputBuilder);
} else {
wfpBuilder.append(outputBuilder).append("\n");
}
wfpBuilder.append(outputBuilder).append("\n");
}
return wfpBuilder.toString();
}
Expand Down Expand Up @@ -275,8 +268,21 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
log.trace("Skipping snippets as the file is too small: {} - {}", filename, contents.length);
return true;
}
if (contents[0] == '{' || contents[0] == '<') {
log.trace("Skipping snippets as the file appears to be JSON/XML/HTML: {}", filename);
//See https://github.com/scanoss/scanoss.py/blob/ede0477f3ea1b13a0147154b565b1bf6a72a6843/src/scanoss/winnowing.py#L248-L260
//for python implementation reference

// Create prefix from first MIN_FILE_SIZE-1 characters, lowercase and trimmed
String prefix = new String(contents, 0, ScanossConstants.MIN_FILE_SIZE - 1).toLowerCase().strip();

// Check for JSON files (starts with { or [)
if (prefix.charAt(0) == '{' || prefix.charAt(0) == '[') {
log.trace("Skipping snippets as the file appears to be JSON: {}", filename);
return true;
}
// Check for XML/HTML/AC3D files with explicit prefix matching
if (prefix.startsWith("<?xml") || prefix.startsWith("<html") ||
prefix.startsWith("<ac3d") || prefix.startsWith("<!doc")) {
log.trace("Skipping snippets as the file appears to be xml/html/binary: {}", filename);
return true;
}
if (!filename.isEmpty()) {
Expand All @@ -288,6 +294,21 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
}
}
}
// Check if first line is too long (matches Python implementation)
int firstLineEnd = 0;
for (int i = 0; i < contents.length; i++) {
if (contents[i] == '\n') {
firstLineEnd = i;
break;
}
}
if (firstLineEnd == 0) {
firstLineEnd = contents.length - 1; // No newline found, use length-1 (matching Python)
}
if (snippetLimit > 0 && firstLineEnd > snippetLimit) {
log.trace("Skipping snippets due to first line being too long: {} - {} chars", filename, firstLineEnd);
return true;
}
return false;
}

Expand Down
6 changes: 3 additions & 3 deletions src/test/java/com/scanoss/TestWinnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,13 @@ public void TestWinnowingFileSkipSnippets() {
String wfp = winnowing.wfpForFile(file, file);
log.info("WFP for Json: {}", wfp);
assertNotNull("Expected a result from WFP", wfp);
assertEquals("file=f8d52217f24ea77ff80a6b1f421d0959,229084,testing/data/non-source.json", wfp.trim());
assertEquals("file=f8d52217f24ea77ff80a6b1f421d0959,229084,testing/data/non-source.json\nfh2=dcae9929f4436808df739f19804cb4d2", wfp.trim());

file = "testing/data/test-file.txt";
wfp = winnowing.wfpForFile(file, file);
log.info("WFP for Json: {}", wfp);
assertNotNull("Expected a result from WFP", wfp);
assertEquals("file=e3dd1a7915d51c8cd1498585e6cea41e,183,testing/data/test-file.txt", wfp.trim());
assertEquals("file=e3dd1a7915d51c8cd1498585e6cea41e,183,testing/data/test-file.txt\nfh2=7de74202074d60759e60f408391e70c4", wfp.trim());

file = "testing/data/too-small.c";
wfp = winnowing.wfpForFile(file, file);
Expand All @@ -183,7 +183,7 @@ public void TestWinnowingFileSkipSnippets() {
wfp = winnowing.wfpForFile(file, file);
log.info("WFP for Json: {}", wfp);
assertNotNull("Expected a result from WFP", wfp);
assertEquals("file=d7cfce9cff6d109c6b0249233ee26368,345,testing/data/json-file.c", wfp.trim());
assertEquals("file=d7cfce9cff6d109c6b0249233ee26368,345,testing/data/json-file.c\nfh2=56c80b467d3bcc13da74943a82c69724", wfp.trim());

file = "testing/data/source-file-with-long-line.c";
wfp = winnowing.wfpForFile(file, file);
Expand Down
Loading