|
22 | 22 | */ |
23 | 23 | package com.scanoss.utils; |
24 | 24 |
|
| 25 | +import lombok.AllArgsConstructor; |
| 26 | +import lombok.Getter; |
25 | 27 | import lombok.NonNull; |
| 28 | +import org.apache.commons.codec.digest.DigestUtils; |
26 | 29 |
|
| 30 | +import java.io.ByteArrayOutputStream; |
27 | 31 | import java.util.HashSet; |
28 | 32 | import java.util.Set; |
29 | 33 | import java.util.regex.Matcher; |
|
37 | 41 | */ |
38 | 42 | public class WinnowingUtils { |
39 | 43 |
|
| 44 | + /** |
| 45 | + * Inner class to hold line ending detection results. |
| 46 | + */ |
| 47 | + @Getter |
| 48 | + @AllArgsConstructor |
| 49 | + public static class LineEndingInfo { |
| 50 | + private final boolean hasCrlf; |
| 51 | + private final boolean hasStandaloneLf; |
| 52 | + private final boolean hasStandaloneCr; |
| 53 | + } |
| 54 | + |
40 | 55 | /** |
41 | 56 | * Normalise the given character |
42 | 57 | * |
@@ -95,4 +110,132 @@ public static Set<String> extractFilePathsFromWFPBlock(@NonNull String wfpBlock) |
95 | 110 |
|
96 | 111 | return paths; |
97 | 112 | } |
| 113 | + |
| 114 | + /** |
| 115 | + * Calculate hash for contents with opposite line endings. |
| 116 | + * If the file is primarily Unix (LF), calculates Windows (CRLF) hash. |
| 117 | + * If the file is primarily Windows (CRLF), calculates Unix (LF) hash. |
| 118 | + * |
| 119 | + * @param contents File contents as bytes |
| 120 | + * @return Hash with opposite line endings as hex string, or null if no line endings detected |
| 121 | + */ |
| 122 | + public static String calculateOppositeLineEndingHash(byte[] contents) { |
| 123 | + LineEndingInfo lineEndingInfo = detectLineEndings(contents); |
| 124 | + |
| 125 | + // If no line endings detected, return null |
| 126 | + if (!lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) { |
| 127 | + return null; |
| 128 | + } |
| 129 | + |
| 130 | + // Normalize all line endings to LF first |
| 131 | + byte[] normalized = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{'\n'}); |
| 132 | + normalized = replaceSequence(normalized, new byte[]{'\r'}, new byte[]{'\n'}); |
| 133 | + |
| 134 | + byte[] oppositeContents; |
| 135 | + |
| 136 | + // Determine the dominant line ending type |
| 137 | + if (lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) { |
| 138 | + // File is Windows (CRLF) - produce Unix (LF) hash |
| 139 | + oppositeContents = normalized; |
| 140 | + } else { |
| 141 | + // File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash |
| 142 | + oppositeContents = replaceSequence(normalized, new byte[]{'\n'}, new byte[]{'\r', '\n'}); |
| 143 | + } |
| 144 | + |
| 145 | + return DigestUtils.md5Hex(oppositeContents); |
| 146 | + } |
| 147 | + |
| 148 | + /** |
| 149 | + * Detect the types of line endings present in file contents. |
| 150 | + * |
| 151 | + * @param contents File contents as bytes |
| 152 | + * @return LineEndingInfo indicating which line ending types are present |
| 153 | + */ |
| 154 | + private static LineEndingInfo detectLineEndings(byte[] contents) { |
| 155 | + // Check for CRLF (Windows line endings) |
| 156 | + boolean hasCrlf = containsSequence(contents, new byte[]{'\r', '\n'}); |
| 157 | + |
| 158 | + // Remove all CRLF sequences to check for standalone LF and CR |
| 159 | + byte[] contentWithoutCrlf = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{}); |
| 160 | + |
| 161 | + // Check for standalone LF (not part of CRLF) |
| 162 | + boolean hasStandaloneLf = containsSequence(contentWithoutCrlf, new byte[]{'\n'}); |
| 163 | + |
| 164 | + // Check for standalone CR (not part of CRLF) |
| 165 | + boolean hasStandaloneCr = containsSequence(contentWithoutCrlf, new byte[]{'\r'}); |
| 166 | + |
| 167 | + return new LineEndingInfo(hasCrlf, hasStandaloneLf, hasStandaloneCr); |
| 168 | + } |
| 169 | + |
| 170 | + /** |
| 171 | + * Check if a byte array contains a specific sequence of bytes. |
| 172 | + * |
| 173 | + * @param data The byte array to search in |
| 174 | + * @param sequence The sequence to search for |
| 175 | + * @return true if the sequence is found, false otherwise |
| 176 | + */ |
| 177 | + private static boolean containsSequence(byte[] data, byte[] sequence) { |
| 178 | + if (sequence.length == 0 || data.length < sequence.length) { |
| 179 | + return false; |
| 180 | + } |
| 181 | + |
| 182 | + for (int i = 0; i <= data.length - sequence.length; i++) { |
| 183 | + boolean found = true; |
| 184 | + for (int j = 0; j < sequence.length; j++) { |
| 185 | + if (data[i + j] != sequence[j]) { |
| 186 | + found = false; |
| 187 | + break; |
| 188 | + } |
| 189 | + } |
| 190 | + if (found) { |
| 191 | + return true; |
| 192 | + } |
| 193 | + } |
| 194 | + return false; |
| 195 | + } |
| 196 | + |
| 197 | + /** |
| 198 | + * Replace all occurrences of a byte sequence with another sequence. |
| 199 | + * Uses ByteArrayOutputStream for better performance compared to List<Byte>. |
| 200 | + * |
| 201 | + * @param data The original byte array |
| 202 | + * @param search The sequence to search for |
| 203 | + * @param replacement The sequence to replace with |
| 204 | + * @return A new byte array with replacements made |
| 205 | + */ |
| 206 | + private static byte[] replaceSequence(byte[] data, byte[] search, byte[] replacement) { |
| 207 | + if (search.length == 0) { |
| 208 | + return data; |
| 209 | + } |
| 210 | + |
| 211 | + ByteArrayOutputStream result = new ByteArrayOutputStream(data.length); |
| 212 | + int i = 0; |
| 213 | + |
| 214 | + while (i < data.length) { |
| 215 | + boolean found = false; |
| 216 | + |
| 217 | + // Check if we have a match at current position |
| 218 | + if (i <= data.length - search.length) { |
| 219 | + found = true; |
| 220 | + for (int j = 0; j < search.length; j++) { |
| 221 | + if (data[i + j] != search[j]) { |
| 222 | + found = false; |
| 223 | + break; |
| 224 | + } |
| 225 | + } |
| 226 | + } |
| 227 | + |
| 228 | + if (found) { |
| 229 | + // Add replacement bytes |
| 230 | + result.write(replacement, 0, replacement.length); |
| 231 | + i += search.length; |
| 232 | + } else { |
| 233 | + // Add current byte |
| 234 | + result.write(data[i]); |
| 235 | + i++; |
| 236 | + } |
| 237 | + } |
| 238 | + |
| 239 | + return result.toByteArray(); |
| 240 | + } |
98 | 241 | } |
0 commit comments