Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Upcoming changes...
- `calculateOppositeLineEndingHash()` method in `WinnowingUtils` to compute hash with opposite line endings (Unix ↔ Windows)
- FH2 hash included in WFP output format as `fh2=<hash>`
- Support for detecting CRLF (Windows), LF (Unix), and CR (legacy Mac) line endings

## [0.11.0] - 2025-05-26
### Added
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public class Test {
The package also ships with a sample CLI. It can be run using the example script [scanoss-cli.sh](scanoss-cli.sh):

```bash
scanos-cli.sh -h
scanoss-cli.sh -h
```

### Custom Certificate
Expand Down Expand Up @@ -91,7 +91,7 @@ packaging/releasing an update.
The following commands are provided for incrementing version:

```bash
make inc_path
make inc_patch
make inc_minor
make inc_major
```
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/com/scanoss/Winnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
return wfpBuilder.toString();
}

String fh2 = WinnowingUtils.calculateOppositeLineEndingHash(contents);
if (fh2 != null){
wfpBuilder.append(String.format("fh2=%s\n",fh2));
}

if(this.isHpsm()){
wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));
}
Expand Down
143 changes: 143 additions & 0 deletions src/main/java/com/scanoss/utils/WinnowingUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@
*/
package com.scanoss.utils;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NonNull;
import org.apache.commons.codec.digest.DigestUtils;

import java.io.ByteArrayOutputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
Expand All @@ -37,6 +41,17 @@
*/
public class WinnowingUtils {

/**
 * Static nested class to hold line ending detection results.
 * <p>
 * Each flag records whether at least one line ending of the given style was found.
 * The constructor (all three flags, in declaration order) and the field getters
 * are generated by the Lombok annotations on the class.
 */
@Getter
@AllArgsConstructor
public static class LineEndingInfo {
// true when the contents hold at least one CR LF ("\r\n") pair
private final boolean hasCrlf;
// true when the contents hold at least one LF ('\n') that is not part of a CR LF pair
private final boolean hasStandaloneLf;
// true when the contents hold at least one CR ('\r') that is not part of a CR LF pair
private final boolean hasStandaloneCr;
}

/**
* Normalise the given character
*
Expand Down Expand Up @@ -95,4 +110,132 @@ public static Set<String> extractFilePathsFromWFPBlock(@NonNull String wfpBlock)

return paths;
}

/**
 * Compute the MD5 hash of the given contents after flipping their line endings.
 * <p>
 * Contents that use CRLF exclusively are hashed in their Unix (LF) form.
 * Any other contents holding at least one line ending (LF, lone CR, or a mixture
 * of styles) are hashed in their Windows (CRLF) form.
 *
 * @param contents File contents as bytes
 * @return MD5 hex digest of the converted contents, or <code>null</code> if the contents hold no line endings
 */
public static String calculateOppositeLineEndingHash(byte[] contents) {
    final LineEndingInfo endings = detectLineEndings(contents);
    final boolean hasOtherThanCrlf = endings.hasStandaloneLf || endings.hasStandaloneCr;

    // Without a single line ending there is nothing to flip
    if (!(endings.hasCrlf || hasOtherThanCrlf)) {
        return null;
    }

    final byte[] cr = {'\r'};
    final byte[] lf = {'\n'};
    final byte[] crlf = {'\r', '\n'};

    // Bring everything to LF: CRLF pairs first, then any CR left on its own
    final byte[] unixForm = replaceSequence(replaceSequence(contents, crlf, lf), cr, lf);

    // Pure CRLF input flips to LF; LF, CR and mixed input flip to CRLF
    final boolean pureWindows = endings.hasCrlf && !hasOtherThanCrlf;
    final byte[] flipped = pureWindows ? unixForm : replaceSequence(unixForm, lf, crlf);

    return DigestUtils.md5Hex(flipped);
}

/**
 * Work out which line ending styles occur in the given file contents.
 * <p>
 * CRLF pairs are looked for in the raw contents. Standalone LF and CR bytes are
 * looked for in a copy from which every CRLF pair has been removed, so bytes that
 * belong to a pair are never counted as standalone.
 *
 * @param contents File contents as bytes
 * @return LineEndingInfo flagging the presence of CRLF, standalone LF and standalone CR
 */
private static LineEndingInfo detectLineEndings(byte[] contents) {
    final byte[] crlf = {'\r', '\n'};

    // Strip every CRLF pair so that any CR or LF still left is a standalone one
    final byte[] stripped = replaceSequence(contents, crlf, new byte[0]);

    return new LineEndingInfo(
            containsSequence(contents, crlf),
            containsSequence(stripped, new byte[]{'\n'}),
            containsSequence(stripped, new byte[]{'\r'}));
}

/**
 * Tell whether a byte sequence occurs anywhere inside a byte array.
 * <p>
 * An empty sequence, or a sequence longer than the data, is never found.
 *
 * @param data The byte array to search in
 * @param sequence The sequence to search for
 * @return <code>true</code> if the sequence occurs in the data, <code>false</code> otherwise
 */
private static boolean containsSequence(byte[] data, byte[] sequence) {
    if (sequence.length == 0 || data.length < sequence.length) {
        return false;
    }

    // Last index at which the whole sequence still fits inside the data
    final int lastStart = data.length - sequence.length;

    for (int start = 0; start <= lastStart; start++) {
        // Count how many leading bytes of the sequence agree at this offset
        int matched = 0;
        while (matched < sequence.length && data[start + matched] == sequence[matched]) {
            matched++;
        }
        if (matched == sequence.length) {
            return true;
        }
    }
    return false;
}

/**
 * Produce a copy of a byte array in which every occurrence of one byte sequence
 * has been swapped for another. Matches are taken left to right and do not overlap.
 * <p>
 * When the search sequence is empty the original array itself is returned.
 *
 * @param data The original byte array
 * @param search The sequence to search for
 * @param replacement The sequence to replace with
 * @return A new byte array with replacements made
 */
private static byte[] replaceSequence(byte[] data, byte[] search, byte[] replacement) {
    if (search.length == 0) {
        return data;
    }

    final ByteArrayOutputStream out = new ByteArrayOutputStream(data.length);
    // Last index at which the whole search sequence still fits inside the data
    final int lastStart = data.length - search.length;
    int pos = 0;

    while (pos < data.length) {
        // Count how many leading bytes of the search sequence agree at this offset
        int matched = 0;
        if (pos <= lastStart) {
            while (matched < search.length && data[pos + matched] == search[matched]) {
                matched++;
            }
        }

        if (matched == search.length) {
            // Full match: emit the replacement and jump past the matched bytes
            out.write(replacement, 0, replacement.length);
            pos += search.length;
        } else {
            // No match here: copy the current byte through unchanged
            out.write(data[pos]);
            pos++;
        }
    }

    return out.toByteArray();
}
}
1 change: 1 addition & 0 deletions src/test/java/com/scanoss/TestWinnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ public void TestWinnowingContentsHPSM() {
assertNotNull(wfp);
assertFalse(wfp.isEmpty());
assertEquals("file=609a24b6cd27ef8108792ca459db1b28,293,local-file.c\n" +
"fh2=0bd0edfa2f3d4903c51b9fd910409942\n" +
"hpsm=df13c104d4\n" +
"3=0ed5027a,a9442399,d019b836\n" +
"4=613d56c0\n" +
Expand Down
106 changes: 106 additions & 0 deletions src/test/java/com/scanoss/utils/WinnowingUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,110 @@ public void testExtractFilePathsFromWFPBlock_ComplexCase_HandlesCorrectly() {
assertTrue(result.contains("/path/to/file2"));
assertTrue(result.contains("/path/to/file3"));
}

// Tests for calculateOppositeLineEndingHash
/**
 * LF-only input must yield the MD5 of the same content written with CRLF line endings.
 */
@Test
public void testCalculateOppositeLineEndingHash_UnixToWindows_ReturnsWindowsHash() {
    // Unix file with LF line endings; the charset is pinned so the bytes never depend on the platform default
    byte[] unixBytes = "line1\nline2\nline3\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    // Expected: Windows content with CRLF
    byte[] windowsBytes = "line1\r\nline2\r\nline3\r\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);

    String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
    assertEquals(expectedHash, result);
}

/**
 * CRLF-only input must yield the MD5 of the same content written with LF line endings.
 */
@Test
public void testCalculateOppositeLineEndingHash_WindowsToUnix_ReturnsUnixHash() {
    // Windows file with CRLF line endings; the charset is pinned so the bytes never depend on the platform default
    byte[] windowsBytes = "line1\r\nline2\r\nline3\r\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    // Expected: Unix content with LF
    byte[] unixBytes = "line1\nline2\nline3\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(unixBytes);

    String result = WinnowingUtils.calculateOppositeLineEndingHash(windowsBytes);
    assertEquals(expectedHash, result);
}

/**
 * Content without any CR or LF byte has no opposite form, so no hash is produced.
 */
@Test
public void testCalculateOppositeLineEndingHash_NoLineEndings_ReturnsNull() {
    // Content without any line endings; the charset is pinned so the bytes never depend on the platform default
    byte[] bytes = "single line with no line endings".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    String result = WinnowingUtils.calculateOppositeLineEndingHash(bytes);
    assertNull(result);
}

/**
 * A zero-length array holds no line endings, so no hash is produced.
 */
@Test
public void testCalculateOppositeLineEndingHash_EmptyContent_ReturnsNull() {
    final byte[] noBytes = {};

    assertNull(WinnowingUtils.calculateOppositeLineEndingHash(noBytes));
}

/**
 * Input mixing LF and CRLF must be hashed as if every line ending were CRLF.
 */
@Test
public void testCalculateOppositeLineEndingHash_MixedLineEndings_ReturnsWindowsHash() {
    // Mixed line endings (LF and CRLF) - should produce Windows hash.
    // The charset is pinned so the bytes never depend on the platform default.
    byte[] mixedBytes = "line1\nline2\r\nline3\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    // Expected: all normalized to Windows CRLF
    byte[] windowsBytes = "line1\r\nline2\r\nline3\r\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);

    String result = WinnowingUtils.calculateOppositeLineEndingHash(mixedBytes);
    assertEquals(expectedHash, result);
}

/**
 * Input using lone CR line endings must be hashed as if every line ending were CRLF.
 */
@Test
public void testCalculateOppositeLineEndingHash_OnlyCarriageReturn_ReturnsWindowsHash() {
    // Old Mac-style CR line endings; the charset is pinned so the bytes never depend on the platform default
    byte[] crBytes = "line1\rline2\rline3\r".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    // Expected: Windows CRLF
    byte[] windowsBytes = "line1\r\nline2\r\nline3\r\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);

    String result = WinnowingUtils.calculateOppositeLineEndingHash(crBytes);
    assertEquals(expectedHash, result);
}

/**
 * A single line terminated by LF must be hashed with that terminator turned into CRLF.
 */
@Test
public void testCalculateOppositeLineEndingHash_SingleLineWithLF_ReturnsWindowsHash() {
    // The charset is pinned so the fixture bytes never depend on the platform default
    byte[] unixBytes = "single line\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    byte[] windowsBytes = "single line\r\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);

    String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
    assertEquals(expectedHash, result);
}

/**
 * Blank lines (consecutive LF bytes) must each be converted to CRLF individually.
 */
@Test
public void testCalculateOppositeLineEndingHash_MultipleConsecutiveLineEndings_HandlesCorrectly() {
    // Multiple consecutive line endings (blank lines).
    // The charset is pinned so the bytes never depend on the platform default.
    byte[] unixBytes = "line1\n\n\nline2\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    byte[] windowsBytes = "line1\r\n\r\n\r\nline2\r\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);

    String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
    assertEquals(expectedHash, result);
}
}
Loading