From f217f12295b8eeaaefaca0bb6313ed2cdd95b0a3 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 25 Mar 2025 18:44:09 +0530 Subject: [PATCH 01/13] HDDS-10822. Tool to omit raft log in OM --- .../ozone/repair/om/OMRatisLogRepair.java | 173 ++++++++++++++++++ .../hadoop/ozone/repair/om/OMRepair.java | 3 +- 2 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java new file mode 100644 index 000000000000..dac38fc3f94f --- /dev/null +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -0,0 +1,173 @@ +package org.apache.hadoop.ozone.repair.om; + +import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.Type.EchoRPC; + +import java.io.File; +import java.io.IOException; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.nio.ByteBuffer; +import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.ozone.om.helpers.OMRatisHelper; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; +import org.apache.hadoop.ozone.repair.RepairTool; +import org.apache.ratis.proto.RaftProtos; +import org.apache.ratis.server.metrics.SegmentedRaftLogMetrics; +import org.apache.ratis.server.raftlog.segmented.LogSegmentPath; +import org.apache.ratis.server.raftlog.segmented.LogSegmentStartEnd; +import org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream; +import org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogOutputStream; +import org.apache.ratis.util.Preconditions; +import org.apache.ratis.util.SizeInBytes; +import picocli.CommandLine; + + +/** + * Tool to omit a raft log in a ratis segment file. + */ +@CommandLine.Command( + name = "ratislogrepair", + description = "CLI to omit a raft log in a ratis segment file. " + + "Requires admin privileges.", + mixinStandardHelpOptions = true, + versionProvider = HddsVersionProvider.class +) +public class OMRatisLogRepair extends RepairTool { + + @CommandLine.Option(names = {"-s", "--segment-path", "--segmentPath"}, + required = true, + description = "Path of the input segment file") + private File segmentFile; + @CommandLine.Option(names = {"-o", "--output-path", "--outputPath"}, + required = true, + description = "Path of the repaired segment file") + private String outputPath; + + @CommandLine.Option(names = {"--index"}, + required = true, + description = "Path of the segment file") + private long index; + + @Override + public void execute() throws Exception { + + LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.segmentFile.toPath()); + if (pi == null) { + error("Invalid segment file"); + throw new RuntimeException("Invalid Segment File"); + } + info("Processing Raft Log file: " + this.segmentFile.getAbsolutePath() + " size:" + this.segmentFile.length()); + SegmentedRaftLogOutputStream outputStream = null; + SegmentedRaftLogInputStream logInputStream = null; + + try { + logInputStream = getInputStream(pi); + File outputFile = createOutputFile(outputPath); + outputStream = new SegmentedRaftLogOutputStream(outputFile, false, + 1024, 1024, ByteBuffer.allocateDirect(10 * 1024)); + + RaftProtos.LogEntryProto next; + for (RaftProtos.LogEntryProto prev = null; (next = logInputStream.nextEntry()) != null; prev = next) { + if (prev != null) { + Preconditions.assertTrue(next.getIndex() == prev.getIndex() + 1L, + "gap between entry %s and entry %s", prev, next); + } + + if (next.getIndex() != index) { + // all other logs will be written as it is + outputStream.write(next); + } else { + // replace the transaction with a dummy OmEcho operation + OzoneManagerProtocolProtos.OMRequest oldRequest = OMRatisHelper + .convertByteStringToOMRequest(next.getStateMachineLogEntry().getLogData()); + OzoneManagerProtocolProtos.OMRequest.Builder newRequest = OzoneManagerProtocolProtos.OMRequest.newBuilder() + .setCmdType(EchoRPC) + .setClientId(oldRequest.getClientId()) + .setEchoRPCRequest(OzoneManagerProtocolProtos.EchoRPCRequest.newBuilder().build()); + + if (oldRequest.hasUserInfo()) { + newRequest.setUserInfo(oldRequest.getUserInfo()); + } + if (oldRequest.hasTraceID()) { + newRequest.setTraceID(oldRequest.getTraceID()); + } + if (oldRequest.hasLayoutVersion()) { + newRequest.setLayoutVersion(oldRequest.getLayoutVersion()); + } + if (oldRequest.hasVersion()) { + newRequest.setVersion(oldRequest.getVersion()); + } + + RaftProtos.StateMachineLogEntryProto oldEntry = next.getStateMachineLogEntry(); + RaftProtos.StateMachineLogEntryProto.Builder newEntry = + RaftProtos.StateMachineLogEntryProto.newBuilder() + .setCallId(oldEntry.getCallId()) + .setClientId(oldEntry.getClientId()) + .setType(oldEntry.getType()) + .setLogData(OMRatisHelper.convertRequestToByteString(newRequest.build())); + if (oldEntry.hasStateMachineEntry()) { + newEntry.setStateMachineEntry(oldEntry.getStateMachineEntry()); + } + + RaftProtos.LogEntryProto newLogEntry = RaftProtos.LogEntryProto.newBuilder() + .setTerm(next.getTerm()) + .setIndex(next.getIndex()) + .setStateMachineLogEntry(newEntry) + .build(); + outputStream.write(newLogEntry); + } + } + } catch (Exception ex) { + error("Exception: " + ex); + } finally { + if (logInputStream != null) { + logInputStream.close(); + } + if (outputStream != null) { + outputStream.flush(); + outputStream.close(); + } + } + } + + private File createOutputFile(String name) throws IOException { + File temp = new File(name); + try { + if (temp.exists()) { + throw new IOException("Error: File already exists - " + temp.getAbsolutePath()); + } else { + if (temp.createNewFile()) { + System.out.println("File created successfully: " + temp.getAbsolutePath()); + } else { + throw new IOException("Error: Failed to create file - " + temp.getAbsolutePath()); + } + } + } catch (IOException e) { + error("Exception while trying to open output file: " + e.getMessage()); + throw e; + } + return temp; + } + + private SegmentedRaftLogInputStream getInputStream(LogSegmentPath pi) { + try { + Class logInputStreamClass = + Class.forName("org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream"); + Constructor constructor = logInputStreamClass.getDeclaredConstructor(File.class, LogSegmentStartEnd.class, + SizeInBytes.class, SegmentedRaftLogMetrics.class); + constructor.setAccessible(true); + SegmentedRaftLogInputStream inputStream = + (SegmentedRaftLogInputStream) constructor.newInstance(segmentFile, pi.getStartEnd(), + SizeInBytes.valueOf("32MB"), null); + if (inputStream == null) { + throw new RuntimeException("logInputStream is null. Constructor might have failed."); + } + return inputStream; + + } catch (ClassNotFoundException | NoSuchMethodException | SecurityException | + InvocationTargetException | InstantiationException | IllegalAccessException ex) { + error("Exception while trying to get input stream for segment file : " + ex); + throw new RuntimeException(ex); + } + } +} diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRepair.java index cac844be882a..7232ccf2aae8 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRepair.java @@ -32,7 +32,8 @@ SnapshotRepair.class, TransactionInfoRepair.class, QuotaRepair.class, - CompactOMDB.class + CompactOMDB.class, + OMRatisLogRepair.class }, description = "Operational tool to repair OM.") @MetaInfServices(RepairSubcommand.class) From f69fd69e26aec51689a65cbf6fa42e98718dbd09 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 25 Mar 2025 19:35:42 +0530 Subject: [PATCH 02/13] Checkstyle fix --- .../ozone/repair/om/OMRatisLogRepair.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index dac38fc3f94f..96a4f33bf229 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.ozone.repair.om; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.Type.EchoRPC; From ac03b2b4e8647259851620ed8b9804b50417c9ce Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Mon, 7 Apr 2025 13:40:43 +0530 Subject: [PATCH 03/13] Take backup and replace original with repaired file --- .../ozone/repair/om/OMRatisLogRepair.java | 71 +++++++++++++++---- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index 96a4f33bf229..f0c50c01823a 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -24,6 +24,8 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import org.apache.hadoop.hdds.cli.HddsVersionProvider; import org.apache.hadoop.ozone.om.helpers.OMRatisHelper; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; @@ -43,9 +45,14 @@ * Tool to omit a raft log in a ratis segment file. */ @CommandLine.Command( - name = "ratislogrepair", - description = "CLI to omit a raft log in a ratis segment file. " + - "Requires admin privileges.", + name = "skip-ratis-transaction", + aliases = "srt", + description = "CLI to omit a raft log in a ratis segment file. The raft log at the index specified " + + "is replaced with an EchoOM command (which is a dummy command). It is an offline command " + + "i.e., doesn't require OM to be running. " + + "The command should be run for the same transaction on all 3 OMs only when they all OMs are crashing " + + "while applying the same transaction. If there is only one OM that is crashing, " + + "then the DB should manually copied from one of the good OMs to the crashing OM instead.", mixinStandardHelpOptions = true, versionProvider = HddsVersionProvider.class ) @@ -55,33 +62,55 @@ public class OMRatisLogRepair extends RepairTool { required = true, description = "Path of the input segment file") private File segmentFile; - @CommandLine.Option(names = {"-o", "--output-path", "--outputPath"}, + @CommandLine.Option(names = {"-b", "--backup"}, required = true, - description = "Path of the repaired segment file") - private String outputPath; + description = "Path to put the backup of the original repaired segment file") + private File backupPath; @CommandLine.Option(names = {"--index"}, required = true, - description = "Path of the segment file") + description = "Index of the failing transaction that should be removed") private long index; @Override public void execute() throws Exception { + info("Taking back up of Raft Log file: " + this.segmentFile.getAbsolutePath() + " to location: " + backupPath); + if (!segmentFile.exists()) { + error("Error: Source segment file \"" + segmentFile + "\" does not exist."); + return; + } + if (backupPath.exists()) { + error("Error: Backup file for segment file \"" + backupPath + "\" already exists."); + return; + } + try { + if (!isDryRun()) { + Files.copy(segmentFile.toPath(), backupPath.toPath()); + } + System.out.println("File renamed successfully!"); + } catch (IOException ex) { + throw new RuntimeException("Error: Failed to take backup of the file. " + + "It might be due to file locks or permission issues."); + } + LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.segmentFile.toPath()); if (pi == null) { - error("Invalid segment file"); throw new RuntimeException("Invalid Segment File"); } + String tempOutput = segmentFile.getAbsolutePath() + ".skr.output"; + File outputFile = createOutputFile(tempOutput); + info("Processing Raft Log file: " + this.segmentFile.getAbsolutePath() + " size:" + this.segmentFile.length()); SegmentedRaftLogOutputStream outputStream = null; SegmentedRaftLogInputStream logInputStream = null; try { logInputStream = getInputStream(pi); - File outputFile = createOutputFile(outputPath); - outputStream = new SegmentedRaftLogOutputStream(outputFile, false, - 1024, 1024, ByteBuffer.allocateDirect(10 * 1024)); + if (!isDryRun()) { + outputStream = new SegmentedRaftLogOutputStream(outputFile, false, + 1024, 1024, ByteBuffer.allocateDirect(10 * 1024)); + } RaftProtos.LogEntryProto next; for (RaftProtos.LogEntryProto prev = null; (next = logInputStream.nextEntry()) != null; prev = next) { @@ -90,7 +119,7 @@ public void execute() throws Exception { "gap between entry %s and entry %s", prev, next); } - if (next.getIndex() != index) { + if (next.getIndex() != index && !isDryRun()) { // all other logs will be written as it is outputStream.write(next); } else { @@ -131,9 +160,22 @@ public void execute() throws Exception { .setIndex(next.getIndex()) .setStateMachineLogEntry(newEntry) .build(); - outputStream.write(newLogEntry); + + info("Replacing {" + next.getStateMachineLogEntry().getLogData() + "} with EchoRPC command at index " + + next.getIndex()); + + if (!isDryRun()) { + outputStream.write(newLogEntry); + } } } + + if (!isDryRun()) { + outputStream.flush(); + outputStream.close(); + Files.move(outputFile.toPath(), segmentFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + } catch (Exception ex) { error("Exception: " + ex); } finally { @@ -144,6 +186,9 @@ public void execute() throws Exception { outputStream.flush(); outputStream.close(); } + if (isDryRun()) { + outputFile.delete(); + } } } From e4e11a7e97a4fcc37065a98346a09ed8073dbc32 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Mon, 7 Apr 2025 13:59:29 +0530 Subject: [PATCH 04/13] Improve error messages --- .../org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index f0c50c01823a..75b247ddb936 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -88,7 +88,7 @@ public void execute() throws Exception { if (!isDryRun()) { Files.copy(segmentFile.toPath(), backupPath.toPath()); } - System.out.println("File renamed successfully!"); + System.out.println("File backed-up successfully!"); } catch (IOException ex) { throw new RuntimeException("Error: Failed to take backup of the file. " + "It might be due to file locks or permission issues."); @@ -96,6 +96,8 @@ public void execute() throws Exception { LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.segmentFile.toPath()); if (pi == null) { + error("Deleting backup file as provided segmentFile is invalid."); + backupPath.delete(); throw new RuntimeException("Invalid Segment File"); } String tempOutput = segmentFile.getAbsolutePath() + ".skr.output"; @@ -161,7 +163,7 @@ public void execute() throws Exception { .setStateMachineLogEntry(newEntry) .build(); - info("Replacing {" + next.getStateMachineLogEntry().getLogData() + "} with EchoRPC command at index " + info("Replacing {" + oldRequest + "} with EchoRPC command at index " + next.getIndex()); if (!isDryRun()) { From 935147d438161c90ada4ea47650e36c31aa126b2 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 8 Apr 2025 14:10:03 +0530 Subject: [PATCH 05/13] support passing ratis directory --- .../ozone/repair/om/OMRatisLogRepair.java | 110 +++++++++++++----- 1 file changed, 81 insertions(+), 29 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index 75b247ddb936..562ef23fb944 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -26,6 +26,8 @@ import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.StandardCopyOption; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.hadoop.hdds.cli.HddsVersionProvider; import org.apache.hadoop.ozone.om.helpers.OMRatisHelper; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; @@ -51,17 +53,22 @@ "is replaced with an EchoOM command (which is a dummy command). It is an offline command " + "i.e., doesn't require OM to be running. " + "The command should be run for the same transaction on all 3 OMs only when they all OMs are crashing " + - "while applying the same transaction. If there is only one OM that is crashing, " + - "then the DB should manually copied from one of the good OMs to the crashing OM instead.", + "while applying the same transaction. If there is only one OM that is crashing and " + + "other OMs have executed the log successfully, then the DB should manually copied from one of the good OMs " + + "to the crashing OM instead.", mixinStandardHelpOptions = true, versionProvider = HddsVersionProvider.class ) public class OMRatisLogRepair extends RepairTool { @CommandLine.Option(names = {"-s", "--segment-path", "--segmentPath"}, - required = true, description = "Path of the input segment file") private File segmentFile; + + @CommandLine.Option(names = {"-d", "--ratis-log-dir", "--ratisLogDir"}, + description = "Path of the ratis log directory") + private File logDir; + @CommandLine.Option(names = {"-b", "--backup"}, required = true, description = "Path to put the backup of the original repaired segment file") @@ -75,31 +82,34 @@ public class OMRatisLogRepair extends RepairTool { @Override public void execute() throws Exception { - info("Taking back up of Raft Log file: " + this.segmentFile.getAbsolutePath() + " to location: " + backupPath); + if (segmentFile == null && logDir == null) { + throw new IllegalArgumentException("Path to either a segment-file or ratis-log-dir must be provided."); + } + if (segmentFile == null) { + segmentFile = findSegmentFileContainingIndex(); + } + + LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.segmentFile.toPath()); + if (pi == null) { + throw new IOException("Invalid Segment File"); + } + if (!segmentFile.exists()) { - error("Error: Source segment file \"" + segmentFile + "\" does not exist."); - return; + throw new IOException("Error: Source segment file \"" + segmentFile + "\" does not exist."); } if (backupPath.exists()) { - error("Error: Backup file for segment file \"" + backupPath + "\" already exists."); - return; + throw new IOException("Error: Backup file for segment file \"" + backupPath + "\" already exists."); } try { + info("Taking back up of Raft Log file: " + this.segmentFile.getAbsolutePath() + " to location: " + backupPath); if (!isDryRun()) { Files.copy(segmentFile.toPath(), backupPath.toPath()); } - System.out.println("File backed-up successfully!"); + info("File backed-up successfully!"); } catch (IOException ex) { - throw new RuntimeException("Error: Failed to take backup of the file. " + - "It might be due to file locks or permission issues."); + throw new IOException("Error: Failed to take backup of the file. Exception: " + ex, ex); } - LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.segmentFile.toPath()); - if (pi == null) { - error("Deleting backup file as provided segmentFile is invalid."); - backupPath.delete(); - throw new RuntimeException("Invalid Segment File"); - } String tempOutput = segmentFile.getAbsolutePath() + ".skr.output"; File outputFile = createOutputFile(tempOutput); @@ -111,7 +121,7 @@ public void execute() throws Exception { logInputStream = getInputStream(pi); if (!isDryRun()) { outputStream = new SegmentedRaftLogOutputStream(outputFile, false, - 1024, 1024, ByteBuffer.allocateDirect(10 * 1024)); + 1024, 1024, ByteBuffer.allocateDirect(SizeInBytes.valueOf("8MB").getSizeInt())); } RaftProtos.LogEntryProto next; @@ -124,6 +134,8 @@ public void execute() throws Exception { if (next.getIndex() != index && !isDryRun()) { // all other logs will be written as it is outputStream.write(next); + outputStream.flush(); + info("Copied raft log for index (" + next.getIndex() + ")."); } else { // replace the transaction with a dummy OmEcho operation OzoneManagerProtocolProtos.OMRequest oldRequest = OMRatisHelper @@ -163,12 +175,12 @@ public void execute() throws Exception { .setStateMachineLogEntry(newEntry) .build(); - info("Replacing {" + oldRequest + "} with EchoRPC command at index " - + next.getIndex()); - if (!isDryRun()) { outputStream.write(newLogEntry); + outputStream.flush(); } + info("Replaced {" + oldRequest + "} with EchoRPC command at index " + + next.getIndex()); } } @@ -176,6 +188,7 @@ public void execute() throws Exception { outputStream.flush(); outputStream.close(); Files.move(outputFile.toPath(), segmentFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + info("Moved temporary output file to correct raft log location : " + segmentFile.toPath()); } } catch (Exception ex) { @@ -198,17 +211,17 @@ private File createOutputFile(String name) throws IOException { File temp = new File(name); try { if (temp.exists()) { - throw new IOException("Error: File already exists - " + temp.getAbsolutePath()); - } else { - if (temp.createNewFile()) { - System.out.println("File created successfully: " + temp.getAbsolutePath()); - } else { - throw new IOException("Error: Failed to create file - " + temp.getAbsolutePath()); + error("Warning: Temporary output file already exists - " + temp.getAbsolutePath() + + ". Trying to delete it and create a new one."); + boolean success = temp.delete(); + if (!success) { + throw new IOException("Unable to delete old temporary file."); } } + temp.createNewFile(); + info("Temporary output file created successfully: " + temp.getAbsolutePath()); } catch (IOException e) { - error("Exception while trying to open output file: " + e.getMessage()); - throw e; + throw new IOException("Error: Failed to create temporary output file - " + temp.getAbsolutePath(), e); } return temp; } @@ -234,4 +247,43 @@ private SegmentedRaftLogInputStream getInputStream(LogSegmentPath pi) { throw new RuntimeException(ex); } } + + private File findSegmentFileContainingIndex() { + if (!logDir.exists() || !logDir.isDirectory()) { + throw new IllegalArgumentException("Invalid log directory: " + logDir); + } + + // Pattern to match Ratis log files: log_- or log_inprogress_ + Pattern pattern = Pattern.compile("log(?:_inprogress)?_(\\d+)(?:-(\\d+))?"); + + File[] segmentFiles = logDir.listFiles(); + if (segmentFiles == null) { + throw new IllegalArgumentException("Invalid log directory: " + logDir + ". No segment files present."); + } + + for (File file : segmentFiles) { + Matcher matcher = pattern.matcher(file.getName()); + if (matcher.matches()) { + long start = Long.parseLong(matcher.group(1)); + String endStr = matcher.group(2); + + // If it's an in-progress file, assume it contains all entries from start onwards + if (endStr == null) { + if (index >= start) { + info("Segment file \"" + file + "\" contains the index (" + index + ")."); + return file; + } + } else { + long end = Long.parseLong(endStr); + if (index >= start && index <= end) { + info("Segment file \"" + file + "\" contains the index (" + index + ")."); + return file; + } + } + } + } + + throw new IllegalArgumentException("Invalid index (" + index + + ") for log directory: \"" + logDir + "\". None of the segment files have the index."); + } } From 14b4a2d6cb6b62a8cada11128c3ed5a6892a95c0 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 8 Apr 2025 14:40:41 +0530 Subject: [PATCH 06/13] backup path sanity check --- .../org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index 562ef23fb944..5b1264754933 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -89,6 +89,10 @@ public void execute() throws Exception { segmentFile = findSegmentFileContainingIndex(); } + if (segmentFile.toPath().equals(backupPath)) { + throw new IOException("Backup path cannot be same as segment file path."); + } + LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.segmentFile.toPath()); if (pi == null) { throw new IOException("Invalid Segment File"); From 726b8a5349d8d899391c31003fb691dc1ed5bccd Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 8 Apr 2025 17:21:24 +0530 Subject: [PATCH 07/13] findBugs fixes --- .../ozone/repair/om/OMRatisLogRepair.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index 5b1264754933..4decca7f7717 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -89,7 +89,7 @@ public void execute() throws Exception { segmentFile = findSegmentFileContainingIndex(); } - if (segmentFile.toPath().equals(backupPath)) { + if (segmentFile.toPath().equals(backupPath.toPath())) { throw new IOException("Backup path cannot be same as segment file path."); } @@ -206,7 +206,10 @@ public void execute() throws Exception { outputStream.close(); } if (isDryRun()) { - outputFile.delete(); + boolean success = outputFile.delete(); + if (!success) { + error("Error: Could not delete temporary output file \"" + outputFile + "\"."); + } } } } @@ -222,9 +225,13 @@ private File createOutputFile(String name) throws IOException { throw new IOException("Unable to delete old temporary file."); } } - temp.createNewFile(); - info("Temporary output file created successfully: " + temp.getAbsolutePath()); - } catch (IOException e) { + boolean success = temp.createNewFile(); + if (success) { + info("Temporary output file created successfully: " + temp.getAbsolutePath()); + } else { + throw new IOException("createNewFile() failed."); + } + } catch (Exception e) { throw new IOException("Error: Failed to create temporary output file - " + temp.getAbsolutePath(), e); } return temp; From 85ccd6a5c266e395a8783ac6a6204ee8e045598f Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 15 Apr 2025 16:36:02 +0530 Subject: [PATCH 08/13] make options mutually exclusive, take backup dir, other small improvements --- .../ozone/repair/om/OMRatisLogRepair.java | 157 +++++++----------- 1 file changed, 63 insertions(+), 94 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index 4decca7f7717..ba2b31d961bd 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -22,7 +22,6 @@ import java.io.File; import java.io.IOException; import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.StandardCopyOption; @@ -61,18 +60,23 @@ ) public class OMRatisLogRepair extends RepairTool { - @CommandLine.Option(names = {"-s", "--segment-path", "--segmentPath"}, - description = "Path of the input segment file") - private File segmentFile; + @CommandLine.ArgGroup(multiplicity = "1") + private ExclusiveArguments exclusiveArguments; - @CommandLine.Option(names = {"-d", "--ratis-log-dir", "--ratisLogDir"}, - description = "Path of the ratis log directory") - private File logDir; + private static final class ExclusiveArguments { + @CommandLine.Option(names = {"-s", "--segment-path"}, + description = "Path of the input segment file") + private File segmentFile; + + @CommandLine.Option(names = {"-d", "--ratis-log-dir"}, + description = "Path of the ratis log directory") + private File logDir; + } @CommandLine.Option(names = {"-b", "--backup"}, required = true, - description = "Path to put the backup of the original repaired segment file") - private File backupPath; + description = "Directory to put the backup of the original repaired segment file before the repair.") + private File backupDir; @CommandLine.Option(names = {"--index"}, required = true, @@ -82,47 +86,51 @@ public class OMRatisLogRepair extends RepairTool { @Override public void execute() throws Exception { - if (segmentFile == null && logDir == null) { - throw new IllegalArgumentException("Path to either a segment-file or ratis-log-dir must be provided."); - } - if (segmentFile == null) { - segmentFile = findSegmentFileContainingIndex(); + if (exclusiveArguments.logDir != null) { + exclusiveArguments.segmentFile = findSegmentFileContainingIndex(); } - if (segmentFile.toPath().equals(backupPath.toPath())) { - throw new IOException("Backup path cannot be same as segment file path."); + if (exclusiveArguments.segmentFile.getParentFile().toPath().equals(backupDir.toPath())) { + throw new IOException("Backup directory cannot be same as segment file's parent directory."); } - LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.segmentFile.toPath()); + LogSegmentPath pi = LogSegmentPath.matchLogSegment(this.exclusiveArguments.segmentFile.toPath()); if (pi == null) { throw new IOException("Invalid Segment File"); } - if (!segmentFile.exists()) { - throw new IOException("Error: Source segment file \"" + segmentFile + "\" does not exist."); + if (!exclusiveArguments.segmentFile.exists()) { + throw new IOException("Error: Source segment file \"" + exclusiveArguments.segmentFile + "\" does not exist."); } + if (!backupDir.exists()) { + info("BackupDir \"" + backupDir + "\" does not exist. Creating the directory path."); + Files.createDirectories(backupDir.toPath()); + } + + File backupPath = new File(backupDir, exclusiveArguments.segmentFile.getName()); if (backupPath.exists()) { - throw new IOException("Error: Backup file for segment file \"" + backupPath + "\" already exists."); + throw new IOException("Error: Backup file for segment file \"" + exclusiveArguments.segmentFile + + "\" already exists. Either delete the old backup or provide a different directory to take the backup."); } - try { - info("Taking back up of Raft Log file: " + this.segmentFile.getAbsolutePath() + " to location: " + backupPath); - if (!isDryRun()) { - Files.copy(segmentFile.toPath(), backupPath.toPath()); - } - info("File backed-up successfully!"); - } catch (IOException ex) { - throw new IOException("Error: Failed to take backup of the file. Exception: " + ex, ex); + info("Taking back up of Raft Log file: " + this.exclusiveArguments.segmentFile.getAbsolutePath() + + " to location: " + backupPath); + if (!isDryRun()) { + Files.copy(exclusiveArguments.segmentFile.toPath(), backupPath.toPath()); } + info("File backed-up successfully!"); - String tempOutput = segmentFile.getAbsolutePath() + ".skr.output"; - File outputFile = createOutputFile(tempOutput); + File outputFile = null; + if (!isDryRun()) { + outputFile = File.createTempFile("srt-output", null, backupDir); + outputFile.deleteOnExit(); + info("Created temporary output file: " + outputFile.toPath()); + } - info("Processing Raft Log file: " + this.segmentFile.getAbsolutePath() + " size:" + this.segmentFile.length()); + info("Processing Raft Log file: " + this.exclusiveArguments.segmentFile.getAbsolutePath() + " size:" + + this.exclusiveArguments.segmentFile.length()); SegmentedRaftLogOutputStream outputStream = null; - SegmentedRaftLogInputStream logInputStream = null; - try { - logInputStream = getInputStream(pi); + try (SegmentedRaftLogInputStream logInputStream = getInputStream(pi)) { if (!isDryRun()) { outputStream = new SegmentedRaftLogOutputStream(outputFile, false, 1024, 1024, ByteBuffer.allocateDirect(SizeInBytes.valueOf("8MB").getSizeInt())); @@ -191,85 +199,46 @@ public void execute() throws Exception { if (!isDryRun()) { outputStream.flush(); outputStream.close(); - Files.move(outputFile.toPath(), segmentFile.toPath(), StandardCopyOption.REPLACE_EXISTING); - info("Moved temporary output file to correct raft log location : " + segmentFile.toPath()); + Files.move(outputFile.toPath(), exclusiveArguments.segmentFile.toPath(), + StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); + info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath()); } - } catch (Exception ex) { - error("Exception: " + ex); } finally { - if (logInputStream != null) { - logInputStream.close(); - } if (outputStream != null) { outputStream.flush(); outputStream.close(); } - if (isDryRun()) { - boolean success = outputFile.delete(); - if (!success) { - error("Error: Could not delete temporary output file \"" + outputFile + "\"."); - } - } } } - private File createOutputFile(String name) throws IOException { - File temp = new File(name); - try { - if (temp.exists()) { - error("Warning: Temporary output file already exists - " + temp.getAbsolutePath() + - ". Trying to delete it and create a new one."); - boolean success = temp.delete(); - if (!success) { - throw new IOException("Unable to delete old temporary file."); - } - } - boolean success = temp.createNewFile(); - if (success) { - info("Temporary output file created successfully: " + temp.getAbsolutePath()); - } else { - throw new IOException("createNewFile() failed."); - } - } catch (Exception e) { - throw new IOException("Error: Failed to create temporary output file - " + temp.getAbsolutePath(), e); - } - return temp; - } - - private SegmentedRaftLogInputStream getInputStream(LogSegmentPath pi) { - try { - Class logInputStreamClass = - Class.forName("org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream"); - Constructor constructor = logInputStreamClass.getDeclaredConstructor(File.class, LogSegmentStartEnd.class, - SizeInBytes.class, SegmentedRaftLogMetrics.class); - constructor.setAccessible(true); - SegmentedRaftLogInputStream inputStream = - (SegmentedRaftLogInputStream) constructor.newInstance(segmentFile, pi.getStartEnd(), - SizeInBytes.valueOf("32MB"), null); - if (inputStream == null) { - throw new RuntimeException("logInputStream is null. Constructor might have failed."); - } - return inputStream; - - } catch (ClassNotFoundException | NoSuchMethodException | SecurityException | - InvocationTargetException | InstantiationException | IllegalAccessException ex) { - error("Exception while trying to get input stream for segment file : " + ex); - throw new RuntimeException(ex); + private SegmentedRaftLogInputStream getInputStream(LogSegmentPath pi) throws Exception { + Class logInputStreamClass = + Class.forName("org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream"); + Constructor constructor = logInputStreamClass.getDeclaredConstructor(File.class, LogSegmentStartEnd.class, + SizeInBytes.class, SegmentedRaftLogMetrics.class); + constructor.setAccessible(true); + SegmentedRaftLogInputStream inputStream = + (SegmentedRaftLogInputStream) constructor.newInstance(exclusiveArguments.segmentFile, pi.getStartEnd(), + SizeInBytes.valueOf("32MB"), null); + if (inputStream == null) { + throw new RuntimeException("logInputStream is null. Constructor might have failed."); } + return inputStream; } private File findSegmentFileContainingIndex() { - if (!logDir.exists() || !logDir.isDirectory()) { - throw new IllegalArgumentException("Invalid log directory: " + logDir); + if (!exclusiveArguments.logDir.exists() || !exclusiveArguments.logDir.isDirectory()) { + throw new IllegalArgumentException("Invalid log directory: " + exclusiveArguments.logDir); } // Pattern to match Ratis log files: log_- or log_inprogress_ Pattern pattern = Pattern.compile("log(?:_inprogress)?_(\\d+)(?:-(\\d+))?"); - File[] segmentFiles = logDir.listFiles(); + File[] segmentFiles = exclusiveArguments.logDir.listFiles(); if (segmentFiles == null) { - throw new IllegalArgumentException("Invalid log directory: " + logDir + ". No segment files present."); + throw new IllegalArgumentException("Invalid log directory: " + exclusiveArguments.logDir + + ". No segment files present."); } for (File file : segmentFiles) { @@ -295,6 +264,6 @@ private File findSegmentFileContainingIndex() { } throw new IllegalArgumentException("Invalid index (" + index - + ") for log directory: \"" + logDir + "\". None of the segment files have the index."); + + ") for log directory: \"" + exclusiveArguments.logDir + "\". None of the segment files have the index."); } } From 4881c7c985e74edd10046fdb2456ce5de396c593 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 15 Apr 2025 16:54:17 +0530 Subject: [PATCH 09/13] clean up dry run --- .../hadoop/ozone/repair/om/OMRatisLogRepair.java | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index ba2b31d961bd..3bbbe2800ae6 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -104,7 +104,9 @@ public void execute() throws Exception { } if (!backupDir.exists()) { info("BackupDir \"" + backupDir + "\" does not exist. Creating the directory path."); - Files.createDirectories(backupDir.toPath()); + if (!isDryRun()) { + Files.createDirectories(backupDir.toPath()); + } } File backupPath = new File(backupDir, exclusiveArguments.segmentFile.getName()); @@ -123,8 +125,8 @@ public void execute() throws Exception { if (!isDryRun()) { outputFile = File.createTempFile("srt-output", null, backupDir); outputFile.deleteOnExit(); - info("Created temporary output file: " + outputFile.toPath()); } + info("Created temporary output file: " + (outputFile == null ? null : outputFile.toPath())); info("Processing Raft Log file: " + this.exclusiveArguments.segmentFile.getAbsolutePath() + " size:" + this.exclusiveArguments.segmentFile.length()); @@ -143,10 +145,12 @@ public void execute() throws Exception { "gap between entry %s and entry %s", prev, next); } - if (next.getIndex() != index && !isDryRun()) { + if (next.getIndex() != index) { // all other logs will be written as it is - outputStream.write(next); - outputStream.flush(); + if (!isDryRun()) { + outputStream.write(next); + outputStream.flush(); + } info("Copied raft log for index (" + next.getIndex() + ")."); } else { // replace the transaction with a dummy OmEcho operation @@ -201,8 +205,8 @@ public void execute() throws Exception { outputStream.close(); Files.move(outputFile.toPath(), exclusiveArguments.segmentFile.toPath(), StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); - info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath()); } + info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath()); } finally { if (outputStream != null) { From 3ddf02cc77c0b90d22a106c66c3c2971be4bc345 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Thu, 17 Apr 2025 19:30:09 +0530 Subject: [PATCH 10/13] Use logSegment.readSegmentFile from ratis --- .../ozone/repair/om/OMRatisLogRepair.java | 160 +++++++----------- 1 file changed, 65 insertions(+), 95 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index 3bbbe2800ae6..abd5eae691d2 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -21,7 +21,6 @@ import java.io.File; import java.io.IOException; -import java.lang.reflect.Constructor; import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.StandardCopyOption; @@ -32,12 +31,11 @@ import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; import org.apache.hadoop.ozone.repair.RepairTool; import org.apache.ratis.proto.RaftProtos; +import org.apache.ratis.server.RaftServerConfigKeys; import org.apache.ratis.server.metrics.SegmentedRaftLogMetrics; +import org.apache.ratis.server.raftlog.segmented.LogSegment; import org.apache.ratis.server.raftlog.segmented.LogSegmentPath; -import org.apache.ratis.server.raftlog.segmented.LogSegmentStartEnd; -import org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream; import org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogOutputStream; -import org.apache.ratis.util.Preconditions; import org.apache.ratis.util.SizeInBytes; import picocli.CommandLine; @@ -83,6 +81,8 @@ private static final class ExclusiveArguments { description = "Index of the failing transaction that should be removed") private long index; + private SegmentedRaftLogOutputStream outputStream = null; + @Override public void execute() throws Exception { @@ -130,107 +130,77 @@ public void execute() throws Exception { info("Processing Raft Log file: " + this.exclusiveArguments.segmentFile.getAbsolutePath() + " size:" + this.exclusiveArguments.segmentFile.length()); - SegmentedRaftLogOutputStream outputStream = null; - - try (SegmentedRaftLogInputStream logInputStream = getInputStream(pi)) { - if (!isDryRun()) { - outputStream = new SegmentedRaftLogOutputStream(outputFile, false, - 1024, 1024, ByteBuffer.allocateDirect(SizeInBytes.valueOf("8MB").getSizeInt())); - } - RaftProtos.LogEntryProto next; - for (RaftProtos.LogEntryProto prev = null; (next = logInputStream.nextEntry()) != null; prev = next) { - if (prev != null) { - Preconditions.assertTrue(next.getIndex() == prev.getIndex() + 1L, - "gap between entry %s and entry %s", prev, next); - } - - if (next.getIndex() != index) { - // all other logs will be written as it is - if (!isDryRun()) { - outputStream.write(next); - outputStream.flush(); - } - info("Copied raft log for index (" + next.getIndex() + ")."); - } else { - // replace the transaction with a dummy OmEcho operation - OzoneManagerProtocolProtos.OMRequest oldRequest = OMRatisHelper - .convertByteStringToOMRequest(next.getStateMachineLogEntry().getLogData()); - OzoneManagerProtocolProtos.OMRequest.Builder newRequest = OzoneManagerProtocolProtos.OMRequest.newBuilder() - .setCmdType(EchoRPC) - .setClientId(oldRequest.getClientId()) - .setEchoRPCRequest(OzoneManagerProtocolProtos.EchoRPCRequest.newBuilder().build()); - - if (oldRequest.hasUserInfo()) { - newRequest.setUserInfo(oldRequest.getUserInfo()); - } - if (oldRequest.hasTraceID()) { - newRequest.setTraceID(oldRequest.getTraceID()); - } - if (oldRequest.hasLayoutVersion()) { - newRequest.setLayoutVersion(oldRequest.getLayoutVersion()); - } - if (oldRequest.hasVersion()) { - newRequest.setVersion(oldRequest.getVersion()); - } + if (!isDryRun()) { + outputStream = new SegmentedRaftLogOutputStream(outputFile, false, + 1024, 1024, ByteBuffer.allocateDirect(SizeInBytes.valueOf("8MB").getSizeInt())); + } - RaftProtos.StateMachineLogEntryProto oldEntry = next.getStateMachineLogEntry(); - RaftProtos.StateMachineLogEntryProto.Builder newEntry = - RaftProtos.StateMachineLogEntryProto.newBuilder() - .setCallId(oldEntry.getCallId()) - .setClientId(oldEntry.getClientId()) - .setType(oldEntry.getType()) - .setLogData(OMRatisHelper.convertRequestToByteString(newRequest.build())); - if (oldEntry.hasStateMachineEntry()) { - newEntry.setStateMachineEntry(oldEntry.getStateMachineEntry()); - } + int entryCount = LogSegment.readSegmentFile(exclusiveArguments.segmentFile, pi.getStartEnd(), + SizeInBytes.valueOf("32MB"), RaftServerConfigKeys.Log.CorruptionPolicy.EXCEPTION, null, this::processLogEntry); + if (!isDryRun()) { + outputStream.flush(); + outputStream.close(); + Files.move(outputFile.toPath(), exclusiveArguments.segmentFile.toPath(), + StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); + } + info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath()); - RaftProtos.LogEntryProto newLogEntry = RaftProtos.LogEntryProto.newBuilder() - .setTerm(next.getTerm()) - .setIndex(next.getIndex()) - .setStateMachineLogEntry(newEntry) - .build(); + if (outputStream != null) { + outputStream.flush(); + outputStream.close(); + } + } - if (!isDryRun()) { - outputStream.write(newLogEntry); - outputStream.flush(); - } - info("Replaced {" + oldRequest + "} with EchoRPC command at index " - + next.getIndex()); + public void processLogEntry(RaftProtos.LogEntryProto proto) { + try { + if (proto.getIndex() != index) { + // all other logs will be written as it is + if (!isDryRun()) { + outputStream.write(proto); + outputStream.flush(); + } + info("Copied raft log for index (" + proto.getIndex() + ")."); + } else { + // replace the transaction with a dummy OmEcho operation + RaftProtos.StateMachineLogEntryProto oldEntry = proto.getStateMachineLogEntry(); + OzoneManagerProtocolProtos.OMRequest.Builder newRequest = OzoneManagerProtocolProtos.OMRequest.newBuilder() + .setCmdType(EchoRPC) + .setClientId("skip-ratis-transaction-repair-tool") + .setEchoRPCRequest(OzoneManagerProtocolProtos.EchoRPCRequest.newBuilder().build()); + + info("Retaining the same { callid: " + oldEntry.getCallId() + ", clientID: " + + oldEntry.getClientId().toString() + ", type: " + oldEntry.getType() + + " } for EchoRPC as the old request"); + + RaftProtos.StateMachineLogEntryProto.Builder newEntry = + RaftProtos.StateMachineLogEntryProto.newBuilder() + .setCallId(oldEntry.getCallId()) + .setClientId(oldEntry.getClientId()) + .setType(oldEntry.getType()) + .setLogData(OMRatisHelper.convertRequestToByteString(newRequest.build())); + + RaftProtos.LogEntryProto newLogEntry = RaftProtos.LogEntryProto.newBuilder() + .setTerm(proto.getTerm()) + .setIndex(proto.getIndex()) + .setStateMachineLogEntry(newEntry) + .build(); + + if (!isDryRun()) { + outputStream.write(newLogEntry); + outputStream.flush(); } - } - - if (!isDryRun()) { - outputStream.flush(); - outputStream.close(); - Files.move(outputFile.toPath(), exclusiveArguments.segmentFile.toPath(), - StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); - } - info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath()); - } finally { - if (outputStream != null) { - outputStream.flush(); - outputStream.close(); + OzoneManagerProtocolProtos.OMRequest oldRequest = OMRatisHelper + .convertByteStringToOMRequest(oldEntry.getLogData()); + info("Replaced {" + oldRequest.toString().replace("\n", " ") + + "} with EchoRPC command at index " + proto.getIndex()); } + } catch (IOException ex) { + throw new RuntimeException("Error while processing logEntry: (" + proto.getIndex() + "). Exception: " + ex); } } - private SegmentedRaftLogInputStream getInputStream(LogSegmentPath pi) throws Exception { - Class logInputStreamClass = - Class.forName("org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream"); - Constructor constructor = logInputStreamClass.getDeclaredConstructor(File.class, LogSegmentStartEnd.class, - SizeInBytes.class, SegmentedRaftLogMetrics.class); - constructor.setAccessible(true); - SegmentedRaftLogInputStream inputStream = - (SegmentedRaftLogInputStream) constructor.newInstance(exclusiveArguments.segmentFile, pi.getStartEnd(), - SizeInBytes.valueOf("32MB"), null); - if (inputStream == null) { - throw new RuntimeException("logInputStream is null. Constructor might have failed."); - } - return inputStream; - } - private File findSegmentFileContainingIndex() { if (!exclusiveArguments.logDir.exists() || !exclusiveArguments.logDir.isDirectory()) { throw new IllegalArgumentException("Invalid log directory: " + exclusiveArguments.logDir); From 06dbf8a4b09d0ac735e702dc5cb81a61721b4d76 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Thu, 17 Apr 2025 20:16:32 +0530 Subject: [PATCH 11/13] Checkstyle and findbugs issues --- .../org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index abd5eae691d2..cc0ce8e1ddb4 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -32,7 +32,6 @@ import org.apache.hadoop.ozone.repair.RepairTool; import org.apache.ratis.proto.RaftProtos; import org.apache.ratis.server.RaftServerConfigKeys; -import org.apache.ratis.server.metrics.SegmentedRaftLogMetrics; import org.apache.ratis.server.raftlog.segmented.LogSegment; import org.apache.ratis.server.raftlog.segmented.LogSegmentPath; import org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogOutputStream; @@ -144,6 +143,7 @@ public void execute() throws Exception { Files.move(outputFile.toPath(), exclusiveArguments.segmentFile.toPath(), StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); } + info("Finished processing all the entries (" + entryCount + " logs) from the segment file."); info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath()); if (outputStream != null) { From c8e7aff1a19eeacd6e8408cf24af5ba30721c2e0 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Tue, 29 Apr 2025 11:05:49 +0200 Subject: [PATCH 12/13] fix pmd --- .../ozone/repair/om/OMRatisLogRepair.java | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index cc0ce8e1ddb4..fce7ed63f8df 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -38,7 +38,6 @@ import org.apache.ratis.util.SizeInBytes; import picocli.CommandLine; - /** * Tool to omit a raft log in a ratis segment file. */ @@ -60,16 +59,6 @@ public class OMRatisLogRepair extends RepairTool { @CommandLine.ArgGroup(multiplicity = "1") private ExclusiveArguments exclusiveArguments; - private static final class ExclusiveArguments { - @CommandLine.Option(names = {"-s", "--segment-path"}, - description = "Path of the input segment file") - private File segmentFile; - - @CommandLine.Option(names = {"-d", "--ratis-log-dir"}, - description = "Path of the ratis log directory") - private File logDir; - } - @CommandLine.Option(names = {"-b", "--backup"}, required = true, description = "Directory to put the backup of the original repaired segment file before the repair.") @@ -240,4 +229,14 @@ private File findSegmentFileContainingIndex() { throw new IllegalArgumentException("Invalid index (" + index + ") for log directory: \"" + exclusiveArguments.logDir + "\". None of the segment files have the index."); } + + private static final class ExclusiveArguments { + @CommandLine.Option(names = {"-s", "--segment-path"}, + description = "Path of the input segment file") + private File segmentFile; + + @CommandLine.Option(names = {"-d", "--ratis-log-dir"}, + description = "Path of the ratis log directory") + private File logDir; + } } From 4248e84aa4ce0bb306d4a81875e24f942efa369a Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Mon, 5 May 2025 13:53:12 +0530 Subject: [PATCH 13/13] Fix description and refactor processLogEntry --- .../ozone/repair/om/OMRatisLogRepair.java | 77 +++++++------------ 1 file changed, 26 insertions(+), 51 deletions(-) diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java index fce7ed63f8df..a98d268d12e4 100644 --- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java +++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/OMRatisLogRepair.java @@ -47,10 +47,10 @@ description = "CLI to omit a raft log in a ratis segment file. The raft log at the index specified " + "is replaced with an EchoOM command (which is a dummy command). It is an offline command " + "i.e., doesn't require OM to be running. " + - "The command should be run for the same transaction on all 3 OMs only when they all OMs are crashing " + - "while applying the same transaction. If there is only one OM that is crashing and " + - "other OMs have executed the log successfully, then the DB should manually copied from one of the good OMs " + - "to the crashing OM instead.", + "The command should be run for the same transaction on all 3 OMs only when all the OMs are crashing " + + "while applying the same transaction. If only one OM is crashing and the " + + "other OMs have executed the log successfully, then the DB should be manually copied " + + "from one of the good OMs to the crashing OM instead.", mixinStandardHelpOptions = true, versionProvider = HddsVersionProvider.class ) @@ -114,7 +114,7 @@ public void execute() throws Exception { outputFile = File.createTempFile("srt-output", null, backupDir); outputFile.deleteOnExit(); } - info("Created temporary output file: " + (outputFile == null ? null : outputFile.toPath())); + info("Created temporary output file: " + (outputFile == null ? "" : outputFile.toPath())); info("Processing Raft Log file: " + this.exclusiveArguments.segmentFile.getAbsolutePath() + " size:" + this.exclusiveArguments.segmentFile.length()); @@ -135,61 +135,36 @@ public void execute() throws Exception { info("Finished processing all the entries (" + entryCount + " logs) from the segment file."); info("Moved temporary output file to correct raft log location : " + exclusiveArguments.segmentFile.toPath()); - if (outputStream != null) { - outputStream.flush(); - outputStream.close(); - } } - public void processLogEntry(RaftProtos.LogEntryProto proto) { + private void processLogEntry(RaftProtos.LogEntryProto proto) { try { - if (proto.getIndex() != index) { - // all other logs will be written as it is - if (!isDryRun()) { - outputStream.write(proto); - outputStream.flush(); - } - info("Copied raft log for index (" + proto.getIndex() + ")."); - } else { - // replace the transaction with a dummy OmEcho operation - RaftProtos.StateMachineLogEntryProto oldEntry = proto.getStateMachineLogEntry(); - OzoneManagerProtocolProtos.OMRequest.Builder newRequest = OzoneManagerProtocolProtos.OMRequest.newBuilder() - .setCmdType(EchoRPC) - .setClientId("skip-ratis-transaction-repair-tool") - .setEchoRPCRequest(OzoneManagerProtocolProtos.EchoRPCRequest.newBuilder().build()); - - info("Retaining the same { callid: " + oldEntry.getCallId() + ", clientID: " + - oldEntry.getClientId().toString() + ", type: " + oldEntry.getType() + - " } for EchoRPC as the old request"); - - RaftProtos.StateMachineLogEntryProto.Builder newEntry = - RaftProtos.StateMachineLogEntryProto.newBuilder() - .setCallId(oldEntry.getCallId()) - .setClientId(oldEntry.getClientId()) - .setType(oldEntry.getType()) - .setLogData(OMRatisHelper.convertRequestToByteString(newRequest.build())); - - RaftProtos.LogEntryProto newLogEntry = RaftProtos.LogEntryProto.newBuilder() - .setTerm(proto.getTerm()) - .setIndex(proto.getIndex()) - .setStateMachineLogEntry(newEntry) - .build(); - - if (!isDryRun()) { - outputStream.write(newLogEntry); - outputStream.flush(); - } - - OzoneManagerProtocolProtos.OMRequest oldRequest = OMRatisHelper - .convertByteStringToOMRequest(oldEntry.getLogData()); - info("Replaced {" + oldRequest.toString().replace("\n", " ") - + "} with EchoRPC command at index " + proto.getIndex()); + RaftProtos.LogEntryProto newLogEntry = proto.getIndex() != index ? proto : getOmEchoLogEntry(proto); + if (!isDryRun()) { + outputStream.write(newLogEntry); + outputStream.flush(); } } catch (IOException ex) { throw new RuntimeException("Error while processing logEntry: (" + proto.getIndex() + "). Exception: " + ex); } } + private RaftProtos.LogEntryProto getOmEchoLogEntry(RaftProtos.LogEntryProto proto) throws IOException { + OzoneManagerProtocolProtos.OMRequest.Builder newRequest = OzoneManagerProtocolProtos.OMRequest.newBuilder() + .setCmdType(EchoRPC) + .setClientId("skip-ratis-transaction-repair-tool") + .setEchoRPCRequest(OzoneManagerProtocolProtos.EchoRPCRequest.newBuilder().build()); + RaftProtos.StateMachineLogEntryProto.Builder entry = proto.getStateMachineLogEntry().toBuilder() + .setLogData(OMRatisHelper.convertRequestToByteString(newRequest.build())); + OzoneManagerProtocolProtos.OMRequest oldRequest = OMRatisHelper + .convertByteStringToOMRequest(proto.getStateMachineLogEntry().getLogData()); + info("Replacing {" + oldRequest.toString().replace("\n", " ") + + "} with EchoRPC command at index " + proto.getIndex()); + return proto.toBuilder() + .setStateMachineLogEntry(entry) + .build(); + } + private File findSegmentFileContainingIndex() { if (!exclusiveArguments.logDir.exists() || !exclusiveArguments.logDir.isDirectory()) { throw new IllegalArgumentException("Invalid log directory: " + exclusiveArguments.logDir);