Skip to content

Commit c0b79d3

Browse files
committed
chore:SP-3579 Implements fh2 opposite line ending hash calculation
1 parent 42e702a commit c0b79d3

File tree

7 files changed

+262
-3
lines changed

7 files changed

+262
-3
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
### Added
1111

1212
- Upcoming changes...
13+
### Added
14+
- `calculateOppositeLineEndingHash()` method in `WinnowingUtils` to compute hash with opposite line endings (Unix ↔ Windows)
15+
- FH2 hash included in WFP output format as `fh2=<hash>`
16+
- Support for detecting CRLF (Windows), LF (Unix), and CR (legacy Mac) line endings
1317

1418
## [0.11.0] - 2025-05-26
1519
### Added

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public class Test {
4949
The package also ships with a sample CLI. It can be run using the example script [scanoss-cli.sh](scanoss-cli.sh):
5050

5151
```bash
52-
scanos-cli.sh -h
52+
scanoss-cli.sh -h
5353
```
5454

5555
### Custom Certificate
@@ -91,7 +91,7 @@ packaging/releasing an update.
9191
The following commands are provided for incrementing version:
9292

9393
```bash
94-
make inc_path
94+
make inc_patch
9595
make inc_minor
9696
make inc_major
9797
```

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>com.scanoss</groupId>
88
<artifactId>scanoss</artifactId>
9-
<version>0.11.0</version>
9+
<version>0.12.0</version>
1010
<packaging>jar</packaging>
1111
<name>scanoss.java</name>
1212
<url>https://github.com/scanoss/scanoss.java</url>

src/main/java/com/scanoss/Winnowing.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
158158
return wfpBuilder.toString();
159159
}
160160

161+
String fh2 = WinnowingUtils.calculateOppositeLineEndingHash(contents);
162+
if (fh2 != null){
163+
wfpBuilder.append(String.format("fh2=%s\n",fh2));
164+
}
165+
161166
if(this.isHpsm()){
162167
wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));
163168
}

src/main/java/com/scanoss/utils/WinnowingUtils.java

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,12 @@
2222
*/
2323
package com.scanoss.utils;
2424

25+
import lombok.AllArgsConstructor;
26+
import lombok.Getter;
2527
import lombok.NonNull;
28+
import org.apache.commons.codec.digest.DigestUtils;
2629

30+
import java.io.ByteArrayOutputStream;
2731
import java.util.HashSet;
2832
import java.util.Set;
2933
import java.util.regex.Matcher;
@@ -37,6 +41,17 @@
3741
*/
3842
public class WinnowingUtils {
3943

44+
/**
45+
* Inner class to hold line ending detection results.
46+
*/
47+
@Getter
48+
@AllArgsConstructor
49+
public static class LineEndingInfo {
50+
private final boolean hasCrlf;
51+
private final boolean hasStandaloneLf;
52+
private final boolean hasStandaloneCr;
53+
}
54+
4055
/**
4156
* Normalise the given character
4257
*
@@ -95,4 +110,132 @@ public static Set<String> extractFilePathsFromWFPBlock(@NonNull String wfpBlock)
95110

96111
return paths;
97112
}
113+
114+
/**
115+
* Calculate hash for contents with opposite line endings.
116+
* If the file is primarily Unix (LF), calculates Windows (CRLF) hash.
117+
* If the file is primarily Windows (CRLF), calculates Unix (LF) hash.
118+
*
119+
* @param contents File contents as bytes
120+
* @return Hash with opposite line endings as hex string, or null if no line endings detected
121+
*/
122+
public static String calculateOppositeLineEndingHash(byte[] contents) {
123+
LineEndingInfo lineEndingInfo = detectLineEndings(contents);
124+
125+
// If no line endings detected, return null
126+
if (!lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) {
127+
return null;
128+
}
129+
130+
// Normalize all line endings to LF first
131+
byte[] normalized = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{'\n'});
132+
normalized = replaceSequence(normalized, new byte[]{'\r'}, new byte[]{'\n'});
133+
134+
byte[] oppositeContents;
135+
136+
// Determine the dominant line ending type
137+
if (lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) {
138+
// File is Windows (CRLF) - produce Unix (LF) hash
139+
oppositeContents = normalized;
140+
} else {
141+
// File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash
142+
oppositeContents = replaceSequence(normalized, new byte[]{'\n'}, new byte[]{'\r', '\n'});
143+
}
144+
145+
return DigestUtils.md5Hex(oppositeContents);
146+
}
147+
148+
/**
149+
* Detect the types of line endings present in file contents.
150+
*
151+
* @param contents File contents as bytes
152+
* @return LineEndingInfo indicating which line ending types are present
153+
*/
154+
private static LineEndingInfo detectLineEndings(byte[] contents) {
155+
// Check for CRLF (Windows line endings)
156+
boolean hasCrlf = containsSequence(contents, new byte[]{'\r', '\n'});
157+
158+
// Remove all CRLF sequences to check for standalone LF and CR
159+
byte[] contentWithoutCrlf = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{});
160+
161+
// Check for standalone LF (not part of CRLF)
162+
boolean hasStandaloneLf = containsSequence(contentWithoutCrlf, new byte[]{'\n'});
163+
164+
// Check for standalone CR (not part of CRLF)
165+
boolean hasStandaloneCr = containsSequence(contentWithoutCrlf, new byte[]{'\r'});
166+
167+
return new LineEndingInfo(hasCrlf, hasStandaloneLf, hasStandaloneCr);
168+
}
169+
170+
/**
171+
* Check if a byte array contains a specific sequence of bytes.
172+
*
173+
* @param data The byte array to search in
174+
* @param sequence The sequence to search for
175+
* @return true if the sequence is found, false otherwise
176+
*/
177+
private static boolean containsSequence(byte[] data, byte[] sequence) {
178+
if (sequence.length == 0 || data.length < sequence.length) {
179+
return false;
180+
}
181+
182+
for (int i = 0; i <= data.length - sequence.length; i++) {
183+
boolean found = true;
184+
for (int j = 0; j < sequence.length; j++) {
185+
if (data[i + j] != sequence[j]) {
186+
found = false;
187+
break;
188+
}
189+
}
190+
if (found) {
191+
return true;
192+
}
193+
}
194+
return false;
195+
}
196+
197+
/**
198+
* Replace all occurrences of a byte sequence with another sequence.
199+
* Uses ByteArrayOutputStream for better performance compared to List<Byte>.
200+
*
201+
* @param data The original byte array
202+
* @param search The sequence to search for
203+
* @param replacement The sequence to replace with
204+
* @return A new byte array with replacements made
205+
*/
206+
private static byte[] replaceSequence(byte[] data, byte[] search, byte[] replacement) {
207+
if (search.length == 0) {
208+
return data;
209+
}
210+
211+
ByteArrayOutputStream result = new ByteArrayOutputStream(data.length);
212+
int i = 0;
213+
214+
while (i < data.length) {
215+
boolean found = false;
216+
217+
// Check if we have a match at current position
218+
if (i <= data.length - search.length) {
219+
found = true;
220+
for (int j = 0; j < search.length; j++) {
221+
if (data[i + j] != search[j]) {
222+
found = false;
223+
break;
224+
}
225+
}
226+
}
227+
228+
if (found) {
229+
// Add replacement bytes
230+
result.write(replacement, 0, replacement.length);
231+
i += search.length;
232+
} else {
233+
// Add current byte
234+
result.write(data[i]);
235+
i++;
236+
}
237+
}
238+
239+
return result.toByteArray();
240+
}
98241
}

src/test/java/com/scanoss/TestWinnowing.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ public void TestWinnowingContentsHPSM() {
105105
assertNotNull(wfp);
106106
assertFalse(wfp.isEmpty());
107107
assertEquals("file=609a24b6cd27ef8108792ca459db1b28,293,local-file.c\n" +
108+
"fh2=0bd0edfa2f3d4903c51b9fd910409942\n" +
108109
"hpsm=df13c104d4\n" +
109110
"3=0ed5027a,a9442399,d019b836\n" +
110111
"4=613d56c0\n" +

src/test/java/com/scanoss/utils/WinnowingUtilsTest.java

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,110 @@ public void testExtractFilePathsFromWFPBlock_ComplexCase_HandlesCorrectly() {
8484
assertTrue(result.contains("/path/to/file2"));
8585
assertTrue(result.contains("/path/to/file3"));
8686
}
87+
88+
// Tests for calculateOppositeLineEndingHash
89+
@Test
90+
public void testCalculateOppositeLineEndingHash_UnixToWindows_ReturnsWindowsHash() {
91+
// Unix file with LF line endings
92+
String unixContent = "line1\nline2\nline3\n";
93+
byte[] unixBytes = unixContent.getBytes();
94+
95+
// Expected: Windows content with CRLF
96+
String windowsContent = "line1\r\nline2\r\nline3\r\n";
97+
byte[] windowsBytes = windowsContent.getBytes();
98+
String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
99+
100+
String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
101+
assertEquals(expectedHash, result);
102+
}
103+
104+
@Test
105+
public void testCalculateOppositeLineEndingHash_WindowsToUnix_ReturnsUnixHash() {
106+
// Windows file with CRLF line endings
107+
String windowsContent = "line1\r\nline2\r\nline3\r\n";
108+
byte[] windowsBytes = windowsContent.getBytes();
109+
110+
// Expected: Unix content with LF
111+
String unixContent = "line1\nline2\nline3\n";
112+
byte[] unixBytes = unixContent.getBytes();
113+
String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(unixBytes);
114+
115+
String result = WinnowingUtils.calculateOppositeLineEndingHash(windowsBytes);
116+
assertEquals(expectedHash, result);
117+
}
118+
119+
@Test
120+
public void testCalculateOppositeLineEndingHash_NoLineEndings_ReturnsNull() {
121+
// Content without any line endings
122+
String content = "single line with no line endings";
123+
byte[] bytes = content.getBytes();
124+
125+
String result = WinnowingUtils.calculateOppositeLineEndingHash(bytes);
126+
assertNull(result);
127+
}
128+
129+
@Test
130+
public void testCalculateOppositeLineEndingHash_EmptyContent_ReturnsNull() {
131+
byte[] emptyBytes = new byte[0];
132+
133+
String result = WinnowingUtils.calculateOppositeLineEndingHash(emptyBytes);
134+
assertNull(result);
135+
}
136+
137+
@Test
138+
public void testCalculateOppositeLineEndingHash_MixedLineEndings_ReturnsWindowsHash() {
139+
// Mixed line endings (LF and CRLF) - should produce Windows hash
140+
String mixedContent = "line1\nline2\r\nline3\n";
141+
byte[] mixedBytes = mixedContent.getBytes();
142+
143+
// Expected: all normalized to Windows CRLF
144+
String windowsContent = "line1\r\nline2\r\nline3\r\n";
145+
byte[] windowsBytes = windowsContent.getBytes();
146+
String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
147+
148+
String result = WinnowingUtils.calculateOppositeLineEndingHash(mixedBytes);
149+
assertEquals(expectedHash, result);
150+
}
151+
152+
@Test
153+
public void testCalculateOppositeLineEndingHash_OnlyCarriageReturn_ReturnsWindowsHash() {
154+
// Old Mac-style CR line endings
155+
String crContent = "line1\rline2\rline3\r";
156+
byte[] crBytes = crContent.getBytes();
157+
158+
// Expected: Windows CRLF
159+
String windowsContent = "line1\r\nline2\r\nline3\r\n";
160+
byte[] windowsBytes = windowsContent.getBytes();
161+
String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
162+
163+
String result = WinnowingUtils.calculateOppositeLineEndingHash(crBytes);
164+
assertEquals(expectedHash, result);
165+
}
166+
167+
@Test
168+
public void testCalculateOppositeLineEndingHash_SingleLineWithLF_ReturnsWindowsHash() {
169+
String unixContent = "single line\n";
170+
byte[] unixBytes = unixContent.getBytes();
171+
172+
String windowsContent = "single line\r\n";
173+
byte[] windowsBytes = windowsContent.getBytes();
174+
String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
175+
176+
String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
177+
assertEquals(expectedHash, result);
178+
}
179+
180+
@Test
181+
public void testCalculateOppositeLineEndingHash_MultipleConsecutiveLineEndings_HandlesCorrectly() {
182+
// Multiple consecutive line endings (blank lines)
183+
String unixContent = "line1\n\n\nline2\n";
184+
byte[] unixBytes = unixContent.getBytes();
185+
186+
String windowsContent = "line1\r\n\r\n\r\nline2\r\n";
187+
byte[] windowsBytes = windowsContent.getBytes();
188+
String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
189+
190+
String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
191+
assertEquals(expectedHash, result);
192+
}
87193
}

0 commit comments

Comments
 (0)