chore:SP-3579 Implements fh2 opposite line ending hash calculation

agustingroh · agustingroh · commit c0b79d38d8bb · 2025-11-03T12:09:26.000-03:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Upcoming changes...
+## [0.12.0] - 2025-11-03
+### Added
+- `calculateOppositeLineEndingHash()` method in `WinnowingUtils` to compute hash with opposite line endings (Unix ↔ Windows)
+- FH2 hash included in WFP output format as `fh2=<hash>`
+- Support for detecting CRLF (Windows), LF (Unix), and CR (legacy Mac) line endings
 
 ## [0.11.0] - 2025-05-26
 ### Added
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ public class Test {
 The package also ships with a sample CLI. It can be run using the example script [scanoss-cli.sh](scanoss-cli.sh):
 
 ```bash
-scanos-cli.sh -h
+scanoss-cli.sh -h
 ```
 
 ### Custom Certificate
@@ -91,7 +91,7 @@ packaging/releasing an update.
 The following commands are provided for incrementing version:
 
 ```bash
-make inc_path
+make inc_patch
 make inc_minor
 make inc_major
 ```
diff --git a/pom.xml b/pom.xml
@@ -6,7 +6,7 @@
 
     <groupId>com.scanoss</groupId>
     <artifactId>scanoss</artifactId>
-    <version>0.11.0</version>
+    <version>0.12.0</version>
     <packaging>jar</packaging>
     <name>scanoss.java</name>
     <url>https://github.com/scanoss/scanoss.java</url>
diff --git a/src/main/java/com/scanoss/Winnowing.java b/src/main/java/com/scanoss/Winnowing.java
@@ -158,6 +158,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
             return wfpBuilder.toString();
         }
 
+        String fh2 = WinnowingUtils.calculateOppositeLineEndingHash(contents);
+        if (fh2 != null){
+            wfpBuilder.append(String.format("fh2=%s\n",fh2));
+        }
+
         if(this.isHpsm()){
             wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));
         }
diff --git a/src/main/java/com/scanoss/utils/WinnowingUtils.java b/src/main/java/com/scanoss/utils/WinnowingUtils.java
@@ -22,8 +22,12 @@
  */
 package com.scanoss.utils;
 
+import lombok.AllArgsConstructor;
+import lombok.Getter;
 import lombok.NonNull;
+import org.apache.commons.codec.digest.DigestUtils;
 
+import java.io.ByteArrayOutputStream;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.regex.Matcher;
@@ -37,6 +41,17 @@
  */
 public class WinnowingUtils {
 
+    /**
+     * Immutable result of line ending detection: one flag per line ending style found.
+     */
+    @Getter
+    @AllArgsConstructor
+    public static class LineEndingInfo {
+        private final boolean hasCrlf; // at least one CR LF pair is present
+        private final boolean hasStandaloneLf; // at least one LF outside of any CR LF pair
+        private final boolean hasStandaloneCr; // at least one CR outside of any CR LF pair
+    }
+
     /**
      * Normalise the given character
      *
@@ -95,4 +110,132 @@ public static Set<String> extractFilePathsFromWFPBlock(@NonNull String wfpBlock)
 
         return paths;
     }
+
+    /**
+     * Calculate the MD5 hash of the contents rewritten with the opposite line ending style.
+     * If every line ending is Windows (CRLF), the Unix (LF) form is hashed.
+     * Otherwise (LF, CR, or any mix of styles), the all-CRLF Windows form is hashed.
+     *
+     * @param contents File contents as bytes
+     * @return MD5 hash of the converted contents as a hex string, or null if no line endings detected
+     */
+    public static String calculateOppositeLineEndingHash(byte[] contents) {
+        LineEndingInfo lineEndingInfo = detectLineEndings(contents);
+
+        // No CR or LF byte anywhere means there is no opposite form to hash, so return null
+        if (!lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) {
+            return null;
+        }
+
+        // Normalize all line endings to LF first: CRLF pairs before lone CRs, so a pair never becomes two LFs
+        byte[] normalized = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{'\n'});
+        normalized = replaceSequence(normalized, new byte[]{'\r'}, new byte[]{'\n'});
+
+        byte[] oppositeContents;
+
+        // Only a file whose line endings are all CRLF flips to LF; anything else flips to CRLF
+        if (lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) {
+            // File is purely Windows (CRLF) - the LF-normalized bytes already are the Unix form
+            oppositeContents = normalized;
+        } else {
+            // File is Unix (LF/CR) or mixed - expand every LF to CRLF to get the Windows form
+            oppositeContents = replaceSequence(normalized, new byte[]{'\n'}, new byte[]{'\r', '\n'});
+        }
+
+        return DigestUtils.md5Hex(oppositeContents);
+    }
+
+    /**
+     * Detect which line ending styles (CRLF, standalone LF, standalone CR) occur in file contents.
+     *
+     * @param contents File contents as bytes
+     * @return LineEndingInfo with one flag per style, each true if that style occurs at least once
+     */
+    private static LineEndingInfo detectLineEndings(byte[] contents) {
+        // Check for CRLF (Windows line endings)
+        boolean hasCrlf = containsSequence(contents, new byte[]{'\r', '\n'});
+
+        // Strip every CRLF pair so that any LF or CR still present is known to be standalone
+        byte[] contentWithoutCrlf = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{});
+
+        // Check for standalone LF (Unix line endings, not part of CRLF)
+        boolean hasStandaloneLf = containsSequence(contentWithoutCrlf, new byte[]{'\n'});
+
+        // Check for standalone CR (legacy Mac line endings, not part of CRLF)
+        boolean hasStandaloneCr = containsSequence(contentWithoutCrlf, new byte[]{'\r'});
+
+        return new LineEndingInfo(hasCrlf, hasStandaloneLf, hasStandaloneCr);
+    }
+
+    /**
+     * Check if a byte array contains a specific sequence of bytes as a contiguous run.
+     *
+     * @param data     The byte array to search in
+     * @param sequence The sequence to search for
+     * @return true if the sequence is found, false otherwise (always false for an empty sequence)
+     */
+    private static boolean containsSequence(byte[] data, byte[] sequence) {
+        if (sequence.length == 0 || data.length < sequence.length) {
+            return false;
+        }
+
+        for (int i = 0; i <= data.length - sequence.length; i++) { // every start offset where the sequence still fits
+            boolean found = true;
+            for (int j = 0; j < sequence.length; j++) {
+                if (data[i + j] != sequence[j]) {
+                    found = false; // mismatch at this offset, move on to the next start position
+                    break;
+                }
+            }
+            if (found) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Replace all occurrences of a byte sequence with another sequence.
+     * Matches are taken left to right and never overlap; output is collected in a ByteArrayOutputStream.
+     *
+     * @param data        The original byte array
+     * @param search      The sequence to search for
+     * @param replacement The sequence to replace with (may be empty to delete each match)
+     * @return A new byte array with replacements made, or the data array itself if search is empty
+     */
+    private static byte[] replaceSequence(byte[] data, byte[] search, byte[] replacement) {
+        if (search.length == 0) {
+            return data;
+        }
+
+        ByteArrayOutputStream result = new ByteArrayOutputStream(data.length);
+        int i = 0;
+
+        while (i < data.length) {
+            boolean found = false;
+
+            // Check if we have a match at current position (only if the whole search sequence still fits)
+            if (i <= data.length - search.length) {
+                found = true;
+                for (int j = 0; j < search.length; j++) {
+                    if (data[i + j] != search[j]) {
+                        found = false;
+                        break;
+                    }
+                }
+            }
+
+            if (found) {
+                // Emit the replacement and skip past the matched bytes
+                result.write(replacement, 0, replacement.length);
+                i += search.length;
+            } else {
+                // No match here: copy the current byte through unchanged
+                result.write(data[i]);
+                i++;
+            }
+        }
+
+        return result.toByteArray();
+    }
 }
diff --git a/src/test/java/com/scanoss/TestWinnowing.java b/src/test/java/com/scanoss/TestWinnowing.java
@@ -105,6 +105,7 @@ public void TestWinnowingContentsHPSM() {
         assertNotNull(wfp);
         assertFalse(wfp.isEmpty());
         assertEquals("file=609a24b6cd27ef8108792ca459db1b28,293,local-file.c\n" +
+                "fh2=0bd0edfa2f3d4903c51b9fd910409942\n" +
                 "hpsm=df13c104d4\n" +
                 "3=0ed5027a,a9442399,d019b836\n" +
                 "4=613d56c0\n" +
diff --git a/src/test/java/com/scanoss/utils/WinnowingUtilsTest.java b/src/test/java/com/scanoss/utils/WinnowingUtilsTest.java
@@ -84,4 +84,110 @@ public void testExtractFilePathsFromWFPBlock_ComplexCase_HandlesCorrectly() {
         assertTrue(result.contains("/path/to/file2"));
         assertTrue(result.contains("/path/to/file3"));
     }
+
+    // Tests for calculateOppositeLineEndingHash
+    @Test
+    public void testCalculateOppositeLineEndingHash_UnixToWindows_ReturnsWindowsHash() {
+        // Input: Unix content with LF-only line endings
+        String unixContent = "line1\nline2\nline3\n";
+        byte[] unixBytes = unixContent.getBytes();
+
+        // Expected: MD5 of the same text with every LF expanded to CRLF
+        String windowsContent = "line1\r\nline2\r\nline3\r\n";
+        byte[] windowsBytes = windowsContent.getBytes();
+        String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
+        assertEquals(expectedHash, result);
+    }
+
+    @Test
+    public void testCalculateOppositeLineEndingHash_WindowsToUnix_ReturnsUnixHash() {
+        // Input: Windows content with CRLF-only line endings
+        String windowsContent = "line1\r\nline2\r\nline3\r\n";
+        byte[] windowsBytes = windowsContent.getBytes();
+
+        // Expected: MD5 of the same text with every CRLF collapsed to LF
+        String unixContent = "line1\nline2\nline3\n";
+        byte[] unixBytes = unixContent.getBytes();
+        String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(unixBytes);
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(windowsBytes);
+        assertEquals(expectedHash, result);
+    }
+
+    @Test
+    public void testCalculateOppositeLineEndingHash_NoLineEndings_ReturnsNull() {
+        // Content with no CR or LF bytes at all, so there is no opposite form to hash
+        String content = "single line with no line endings";
+        byte[] bytes = content.getBytes();
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(bytes);
+        assertNull(result);
+    }
+
+    @Test
+    public void testCalculateOppositeLineEndingHash_EmptyContent_ReturnsNull() {
+        byte[] emptyBytes = new byte[0]; // zero-length input contains no CR or LF bytes
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(emptyBytes);
+        assertNull(result);
+    }
+
+    @Test
+    public void testCalculateOppositeLineEndingHash_MixedLineEndings_ReturnsWindowsHash() {
+        // Mixed line endings (LF and CRLF) - not purely CRLF, so the Windows form is hashed
+        String mixedContent = "line1\nline2\r\nline3\n";
+        byte[] mixedBytes = mixedContent.getBytes();
+
+        // Expected: every line ending, LF or CRLF, ends up as a single CRLF
+        String windowsContent = "line1\r\nline2\r\nline3\r\n";
+        byte[] windowsBytes = windowsContent.getBytes();
+        String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(mixedBytes);
+        assertEquals(expectedHash, result);
+    }
+
+    @Test
+    public void testCalculateOppositeLineEndingHash_OnlyCarriageReturn_ReturnsWindowsHash() {
+        // Old Mac-style content: lone CR line endings only
+        String crContent = "line1\rline2\rline3\r";
+        byte[] crBytes = crContent.getBytes();
+
+        // Expected: each lone CR becomes a CRLF
+        String windowsContent = "line1\r\nline2\r\nline3\r\n";
+        byte[] windowsBytes = windowsContent.getBytes();
+        String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(crBytes);
+        assertEquals(expectedHash, result);
+    }
+
+    @Test
+    public void testCalculateOppositeLineEndingHash_SingleLineWithLF_ReturnsWindowsHash() {
+        String unixContent = "single line\n"; // one line terminated by a single LF
+        byte[] unixBytes = unixContent.getBytes();
+
+        String windowsContent = "single line\r\n"; // same text terminated by CRLF
+        byte[] windowsBytes = windowsContent.getBytes();
+        String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
+        assertEquals(expectedHash, result);
+    }
+
+    @Test
+    public void testCalculateOppositeLineEndingHash_MultipleConsecutiveLineEndings_HandlesCorrectly() {
+        // Multiple consecutive LF line endings (blank lines between line1 and line2)
+        String unixContent = "line1\n\n\nline2\n";
+        byte[] unixBytes = unixContent.getBytes();
+
+        String windowsContent = "line1\r\n\r\n\r\nline2\r\n"; // each LF, blank lines included, becomes its own CRLF
+        byte[] windowsBytes = windowsContent.getBytes();
+        String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes);
+
+        String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes);
+        assertEquals(expectedHash, result);
+    }
 }

Original file line number	Diff line number	Diff line change
`@@ -158,6 +158,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c`
`158`	`158`	`return wfpBuilder.toString();`
`159`	`159`	`}`
`160`	`160`
	`161`	`+ String fh2 = WinnowingUtils.calculateOppositeLineEndingHash(contents);`
	`162`	`+ if (fh2 != null){`
	`163`	`+ wfpBuilder.append(String.format("fh2=%s\n",fh2));`
	`164`	`+ }`
	`165`	`+`
`161`	`166`	`if(this.isHpsm()){`
`162`	`167`	`wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));`
`163`	`168`	`}`