-
Notifications
You must be signed in to change notification settings - Fork 3.4k
HBASE-23085: Network and Data related Actions #675
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c7f6bc7
d6bcf6b
730169b
4be3179
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| /** | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hadoop.hbase.chaos.actions; | ||
|
|
||
| import java.io.IOException; | ||
|
|
||
| import org.apache.hadoop.hbase.ServerName; | ||
| import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| /** | ||
| * Action that adds high cpu load to a random regionserver for a given duration | ||
| */ | ||
| public class AddCPULoadAction extends SudoCommandAction { | ||
| /** Logger for this action. */ | |
| protected static final Logger LOG = LoggerFactory.getLogger(AddCPULoadAction.class); | |
| // Shell pipeline template filled in by getCommand(): the first two %s receive the process | |
| // count (number of lines produced by seq and the xargs -P parallelism) and the third %s | |
| // receives the duration in seconds handed to timeout, which wraps each dd process. | |
| // %s is used instead of %d on purpose, to avoid locale-specific number formatting. | |
| private static final String CPU_LOAD_COMMAND = | |
| "seq 1 %s | xargs -I{} -n 1 -P %s timeout %s dd if=/dev/urandom of=/dev/null bs=1M " + | |
| "iflag=fullblock"; | |
|
Comment on lines
+34
to
+35
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. %s is used but numbers are added to the String.format arguments.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is intentional. %s uses Integer.toString, which is predictable while %d uses locale specific formatting that might change. |
||
|
|
||
| // How long the load should be generated, in milliseconds (converted to seconds in getCommand()). | |
| private final long duration; | |
| // Number of parallel load-generating processes to start. | |
| private long processes; | |
|
|
||
| /** | |
| * Add high load to cpu | |
| * | |
| * @param duration Duration that this thread should generate the load for in milliseconds | |
| * @param processes The number of parallel processes, should be equal to cpu threads for max load | |
| * @param timeout Timeout forwarded to the superclass constructor; it is the value later passed | |
| *                to execSudo when the load command is run (presumably milliseconds - TODO confirm) | |
| */ | |
| public AddCPULoadAction(long duration, long processes, long timeout) { | |
| super(timeout); | |
| this.duration = duration; | |
| this.processes = processes; | |
| } | |
|
|
||
| /** | |
| * Picks a random server from the current servers and runs the CPU load command on its host | |
| * through the cluster manager's sudo execution. | |
| * | |
| * @throws IOException declared by the signature; the IOException raised by the load command | |
| *                     itself is caught and ignored below | |
| */ | |
| protected void localPerform() throws IOException { | |
| LOG.info("Starting to execute AddCPULoadAction"); | |
| ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); | |
| String hostname = server.getHostname(); | |
|
|
||
| try { | |
| clusterManager.execSudo(hostname, timeout, getCommand()); | |
| } catch (IOException ex){ | |
| // This will always happen. We use timeout to kill a continuously running process | |
| // after the duration expires, so the failure is expected and deliberately ignored. | |
|
Comment on lines
+60
to
+61
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should add
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @meszibalu As discussed for /bin/true to make a difference we should increase the outer timeout and that would not be much better so I will leave it as it is. |
||
| } | |
| LOG.info("Finished to execute AddCPULoadAction"); | |
| } | |
|
|
||
| private String getCommand(){ | ||
| return String.format(CPU_LOAD_COMMAND, processes, processes, duration/1000f); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| /** | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hadoop.hbase.chaos.actions; | ||
|
|
||
| import org.apache.commons.lang3.RandomUtils; | ||
| import org.apache.hadoop.fs.FSDataOutputStream; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.LocatedFileStatus; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.hadoop.fs.RemoteIterator; | ||
| import org.apache.hadoop.hbase.io.hfile.HFile; | ||
| import org.apache.hadoop.hbase.util.CommonFSUtils; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| /** | ||
| * Action corrupts HFiles with a certain chance. | ||
| */ | ||
| public class CorruptDataFilesAction extends Action { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Neat idea, but how could we tell via automation when a file was expectedly corrupted vs. unexpectedly?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't, as far as I'm aware. It's not clear to me what the intended use of these tests is, but they were requested by stack, so I added them. They are so destructive that I couldn't even restart HBase after running them and had to delete all HBase-related data from HDFS and ZooKeeper to be able to run HBase on the cluster again.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was worried with corrupting something critical like hbase:meta, a table descriptor, or something like that. I think corrupting a single hfile for a user-table is a more "reasonable" failure condition which wouldn't have long-lasting impact on the ability for HBase to keep working. @saintstack that jive with what you were thinking or you have something else in mind?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Resolving this -- defaulting to |
||
| /** Logger for this action. */ | |
| private static final Logger LOG = LoggerFactory.getLogger(CorruptDataFilesAction.class); | |
| // Corruption chance stored as a percentage: the constructor multiplies its argument by 100 | |
| // and perform() compares it against RandomUtils.nextFloat(0, 100). | |
| private float chance; | |
|
|
||
| /** | |
| * Corrupts HFiles with a certain chance | |
| * @param chance chance to corrupt any given data file, expressed as a fraction (0.5 => 50%); | |
| *               it is stored internally as a percentage | |
| */ | |
| public CorruptDataFilesAction(float chance) { | |
| this.chance = chance * 100; | |
| } | |
|
|
||
| /** | |
| * Overwrites a random subset of the HFiles found under {@code <rootDir>/data/default}. | |
| * <p> | |
| * Every file below that directory is listed recursively. Files that are not in HFile format | |
| * are skipped; each remaining file is corrupted with the configured chance by recreating it | |
| * (overwrite enabled) with a single zero byte as its only content. | |
| * | |
| * @throws Exception propagated from the file system calls used to list, inspect or | |
| *                   overwrite the files | |
| */ | |
| @Override | |
| public void perform() throws Exception { | |
|   LOG.info("Start corrupting data files"); | |
|
|
||
|   FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf()); | |
|   Path rootDir = CommonFSUtils.getRootDir(getConf()); | |
|   Path defaultDir = rootDir.suffix("/data/default"); | |
|   RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(defaultDir, true); | |
|   while (iterator.hasNext()) { | |
|     LocatedFileStatus status = iterator.next(); | |
|     if (!HFile.isHFileFormat(fs, status.getPath())) { | |
|       continue; | |
|     } | |
|     // chance is a percentage; use >= so that a chance of 0 can never corrupt a file, even | |
|     // when the random draw is exactly 0, while a chance of 100 still always corrupts. | |
|     if (RandomUtils.nextFloat(0, 100) >= chance) { | |
|       continue; | |
|     } | |
|
|
||
|     // Log before touching the file so a failed overwrite can still be traced to its path. | |
|     LOG.info("Corrupting {}", status.getPath()); | |
|     // try-with-resources closes the stream and keeps a write failure as the primary | |
|     // exception instead of letting a close failure replace it. | |
|     try (FSDataOutputStream out = fs.create(status.getPath(), true)) { | |
|       out.write(0); | |
|     } | |
|   } | |
|   LOG.info("Done corrupting data files"); | |
| } | |
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| /** | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hadoop.hbase.chaos.actions; | ||
|
|
||
| import java.io.IOException; | ||
|
|
||
| import org.apache.hadoop.hbase.ServerName; | ||
| import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| /** | ||
| * | ||
| * Corrupt network packages on a random regionserver. | ||
| */ | ||
| public class CorruptPackagesCommandAction extends TCCommandAction { | ||
| /** Logger for this action. */ | |
| private static final Logger LOG = LoggerFactory.getLogger(CorruptPackagesCommandAction.class); | |
| // Fraction of packets to corrupt; getCommand() multiplies it by 100 and appends a percent sign. | |
| private float ratio; | |
| // Time in milliseconds to wait (Thread.sleep) between adding and deleting the corruption rule. | |
| private long duration; | |
|
|
||
| /** | |
| * Corrupt network packets on a random regionserver. | |
| * | |
| * @param ratio the ratio of packets corrupted, expressed as a fraction (0.5 => 50%) | |
| * @param duration the time this issue persists in milliseconds | |
| * @param timeout the timeout for executing required commands on the region server in milliseconds | |
| * @param network network interface the regionserver uses for communication | |
| */ | |
| public CorruptPackagesCommandAction(float ratio, long duration, long timeout, String network) { | |
joshelser marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| super(timeout, network); | |
| this.ratio = ratio; | |
| this.duration = duration; | |
| } | |
|
|
||
| /** | |
| * Adds a packet-corruption rule on a randomly selected server, keeps it in place for the | |
| * configured duration and then removes it again. | |
| * <p> | |
| * The DELETE command is issued from a finally block, so it runs even when the ADD command | |
| * fails or the wait is interrupted. If the wait is interrupted, the thread's interrupt | |
| * status is restored after the cleanup has been attempted. | |
| * | |
| * @throws IOException if adding or deleting the rule fails | |
| */ | |
| protected void localPerform() throws IOException { | |
|   LOG.info("Starting to execute CorruptPackagesCommandAction"); | |
|   ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); | |
|   String hostname = server.getHostname(); | |
|
|
||
|   boolean interrupted = false; | |
|   try { | |
|     clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD)); | |
|     Thread.sleep(duration); | |
|   } catch (InterruptedException e) { | |
|     // Remember the interrupt instead of re-asserting it right away, so the cleanup | |
|     // below is not disturbed by an already-set interrupt flag. | |
|     interrupted = true; | |
|     LOG.debug("Failed to run the command for the full duration", e); | |
|   } finally { | |
|     try { | |
|       clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE)); | |
|     } finally { | |
|       if (interrupted) { | |
|         // Restore the interrupt status so callers can observe the interruption. | |
|         Thread.currentThread().interrupt(); | |
|       } | |
|     } | |
|   } | |
|
|
||
|   LOG.info("Finished to execute CorruptPackagesCommandAction"); | |
| } | |
|
|
||
| private String getCommand(String operation){ | ||
| return String.format("tc qdisc %s dev %s root netem corrupt %s%%", operation, network, | ||
| ratio * 100); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.