11 changes: 11 additions & 0 deletions hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
@@ -1620,6 +1620,17 @@ public enum OperationStatusCode {
*/
public static final int BATCH_ROWS_THRESHOLD_DEFAULT = 5000;

/**
* When ZooKeeper data does not exist on the master during meta bootstrap, remove the meta
* table directory by default, since it is considered a partially created meta.
*
* TODO: we can remove this feature once we have a way to identify a partial meta during
* bootstrap of a cluster that comes without ZooKeeper data.
*/
public static final String REMOVE_META_ON_RESTART = "hbase.master.remove.meta.on.restart";
public static final boolean DEFAULT_REMOVE_META_ON_RESTART = true;


private HConstants() {
// Can't be instantiated with this ctor.
}
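To disable the cleanup (for example when restarting a cluster over existing data, as this PR's test does), the flag can be set like any other boolean key. A minimal sketch, assuming nothing beyond the two constants added above; the wrapper class is hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;

public class RemoveMetaFlagExample {
  public static Configuration keepMetaOnRestart() {
    Configuration conf = HBaseConfiguration.create();
    // Default is true: a meta dir found without matching ZK data is treated as
    // partially created and removed. Set false to keep it across restarts.
    conf.setBoolean(HConstants.REMOVE_META_ON_RESTART, false);
    return conf;
  }
}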
hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -1078,6 +1078,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
// complete before we do this next step processing offline regions else it fails reading
// table states messing up master launch (namespace table, etc., are not assigned).
this.assignmentManager.processOfflineRegions();
this.assignmentManager.processRegionsOnUnknownServers();
// Initialize after meta is up as below scans meta
if (favoredNodesManager != null && !maintenanceMode) {
SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment =
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1441,6 +1441,35 @@ public void processOfflineRegions() {
}
}

/**
* Create assign procedures for the non-offline regions of enabled tables that are assigned
* to `unknown` servers, once hbase:meta is online.
*
* This is a special case: the WAL directory, SCP WALs and ZK data have been cleared, and
* the cluster restarts with only the hbase:meta table and other tables' storefiles on disk.
*/
public void processRegionsOnUnknownServers() {
List<RegionInfo> regionsOnUnknownServers = regionStates.getRegionStates().stream()
.filter(s -> !s.isOffline())
.filter(s -> isTableEnabled(s.getRegion().getTable()))
.filter(s -> !regionStates.isRegionInTransition(s.getRegion()))
.filter(s -> {
ServerName serverName = regionStates.getRegionServerOfRegion(s.getRegion());
if (serverName == null) {
return false;
}
return master.getServerManager().isServerKnownAndOnline(serverName)
.equals(ServerManager.ServerLiveState.UNKNOWN);
Comment on lines +1453 to +1462
Member
Collapse these down into one method so we don't end up making 4 iterations over a list of (potentially) a lot of regions.
})
.map(RegionState::getRegion).collect(Collectors.toList());
if (!regionsOnUnknownServers.isEmpty()) {
LOG.info("Found regions {} on unknown servers, reassign them to online servers",
regionsOnUnknownServers);
master.getMasterProcedureExecutor().submitProcedures(
master.getAssignmentManager().createRoundRobinAssignProcedures(regionsOnUnknownServers));
}
}
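As a sketch of the reviewer's suggestion, the four filters could be folded into one predicate. Names follow the surrounding code; the helper itself is hypothetical, and note that chained Stream.filter calls are already fused into a single pass, so the gain is mainly keeping the per-region checks in one place:

private boolean shouldReassignFromUnknownServer(RegionState state) {
  // same checks as the four filters above, evaluated in one place
  if (state.isOffline()
      || !isTableEnabled(state.getRegion().getTable())
      || regionStates.isRegionInTransition(state.getRegion())) {
    return false;
  }
  ServerName serverName = regionStates.getRegionServerOfRegion(state.getRegion());
  return serverName != null
      && master.getServerManager().isServerKnownAndOnline(serverName)
          == ServerManager.ServerLiveState.UNKNOWN;
}

// usage: regionStates.getRegionStates().stream()
//     .filter(this::shouldReassignFromUnknownServer)
//     .map(RegionState::getRegion).collect(Collectors.toList());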

/* AM internal RegionStateStore.RegionStateVisitor implementation. To be used when
* scanning META table for region rows, using RegionStateStore utility methods. RegionStateStore
* methods will convert Result into proper RegionInfo instances, but those would still need to be
hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/InitMetaProcedure.java
@@ -71,7 +71,11 @@ private static void writeFsLayout(Path rootDir, Configuration conf) throws IOExc
LOG.info("BOOTSTRAP: creating hbase:meta region");
FileSystem fs = rootDir.getFileSystem(conf);
Path tableDir = CommonFSUtils.getTableDir(rootDir, TableName.META_TABLE_NAME);
if (fs.exists(tableDir) && !fs.delete(tableDir, true)) {
boolean removeMeta = conf.getBoolean(HConstants.REMOVE_META_ON_RESTART,
Contributor
// Checking if meta needs initializing.
status.setStatus("Initializing meta table if this is a new deploy");
InitMetaProcedure initMetaProc = null;
// Print out state of hbase:meta on startup; helps debugging.
RegionState rs = this.assignmentManager.getRegionStates().
getRegionState(RegionInfoBuilder.FIRST_META_REGIONINFO);
LOG.info("hbase:meta {}", rs);
if (rs != null && rs.isOffline()) {
Optional optProc = procedureExecutor.getProcedures().stream()
.filter(p -> p instanceof InitMetaProcedure).map(o -> (InitMetaProcedure) o).findAny();
initMetaProc = optProc.orElseGet(() -> {
// schedule an init meta procedure if meta has not been deployed yet
InitMetaProcedure temp = new InitMetaProcedure();
procedureExecutor.submitProcedure(temp);
return temp;
});
}
So the checks here see that the META location is not in ZK, and conclude that it is a new deploy. Here is what we need to tackle:
In the cloud redeploy case we will see a pattern where we have a clusterId in the FS but not in ZK. Can this be used as an indicator? IMO we should detect, this way or another, that it is a redeploy on an existing dataset, and in all these places take that into account when deciding whether we need such bootstrap steps.
We should not do this via a config, IMO. Because then, in a cloud-based deploy, what if the first start fails and there is a genuine need for this bootstrap cleanup of the META FS dir?
The same applies to the other unknown-server case. Let's clearly identify this redeploy case and act only then.
Can we please have that discussion and conclude on a solution before moving forward?

Contributor Author

In cloud redeploy case we will see a pattern where we will have a clusterId in the FS and not in zk. This can be used as an indicator?

I will come back on this later tomorrow, but I agree with you that we should define explicitly what a partial bootstrap is, and that a partial meta needs some cleanup.

Also, do you mean that if the clusterID was not written to ZK, the bootstrap is considered partial?

Contributor
@anoopsjohn Jul 28, 2020

Also, do you mean that if the clusterID was not written to ZK, the bootstrap is considered partial?

I am not sure whether that can really be used; I need to check the code. We need a way to identify the fact that it is a cluster redeploy, not some config to tell us. The HBase system should be smart enough, so I was just wondering whether we can use this to know that. Maybe not; need to see. My thinking is that we should make recreating a cluster on top of existing data a first-class feature for HBase itself.

Contributor

My thinking is that we should make recreating a cluster on top of existing data a first-class feature for HBase itself.

That would be great, let's find a good way to differentiate this case.

Contributor Author

Sorry for the late reply. I have reread the code and come up with the following.

First of all, in the current logic a partial meta should mean that a procedure WAL of InitMetaProcedure did not succeed and INIT_META_ASSIGN_META was not completed. Currently, even if the meta table can be read and a table descriptor retrieved but the region is not assigned, it is still considered partial (correct me if I'm wrong). So, in short, a partial meta table cannot be defined by reading the tableinfo or the storefiles themselves.

Further, a combination of the WALs, procedure WALs and ZooKeeper data is required to define a partial meta in the normal cases. But for the cloud use case, or other use cases where one of these is missing, we need a different discussion. For example:

  1. Partial meta in the long-running HDFS cluster cases:
    a. If we have WALs and ZK, meta will be reassigned normally.
    b. If we have WALs but no ZK, the master will not submit a new InitMetaProcedure (or enter any of its states) because it finds the old InitMetaProcedure in the WAL. The old server is not handled by submitting any SCP, and the assignment manager does nothing, so the master hangs and never finishes initialization. (This is a different problem from the cloud case.)
    c. If we have no WALs but have ZK, state=OPEN remains for hbase:meta when opening an existing meta region, and InitMetaProcedure will not be submitted or entered either (see the HMaster section quoted above). The master hangs and never finishes initialization. (This is also a different problem from the cloud case.)

Therefore, if this PR focuses only on the cloud use cases, the unknown servers and the partial meta become much simpler. For example, when running InitMetaProcedure, the clusterID in ZooKeeper (as suggested by Anoop) can be used to indicate a partial meta: fresh ZK data implies that the region WALs and the procedure WAL of InitMetaProcedure may not exist. And if the WALs and procedure WAL do exist, we fall into the same failures as case 1b above (out of scope for this PR).

  2. Partial meta on cloud without WALs and ZK:
    a. If we are in INIT_META_WRITE_FS_LAYOUT and continue, then ZK must have existed when the master restarted. Otherwise, for the case of WALs present but no ZK, we fall back to case 1b, which we do not handle in this PR.
    b. If there are no WALs and no ZK, the master submits an InitMetaProcedure and the procedure lands in INIT_META_WRITE_FS_LAYOUT.
    • During INIT_META_WRITE_FS_LAYOUT, if ZK data does not exist but a meta directory does, we should trust the directory and try to open it.
      • We run this INIT_META_WRITE_FS_LAYOUT state only when ZK data does not exist or a previous INIT_META_WRITE_FS_LAYOUT did not finish.

So we are fixing case 2b in this PR. I have come up with a prototype, and unit tests are running off this PR now (TestClusterRestartFailoverSplitWithoutZk is failing even without our changes on branch-2).

The proposed changes are:

  • Only reassign regions on unknown servers when there are no procedure WALs, no region WALs and no ZK data.
  • Do not recreate the meta table directory if the restarted InitMetaProcedure#INIT_META_WRITE_FS_LAYOUT comes with no ZK data (and maybe no WALs as well).
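A hedged sketch of the first bullet's guard, assuming it runs on the master with the WAL root dir and a ZK watcher at hand. WALProcedureStore.MASTER_PROCEDURE_LOGDIR, HConstants.HREGION_LOGDIR_NAME and ZKClusterId.readClusterIdZNode are existing HBase names, but this helper and its wiring are illustrative, not this PR's code:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;

public final class UnknownServerReassignGuard {
  // true only when there are no procedure WALs, no region WALs and no ZK data
  static boolean safeToReassign(FileSystem fs, Path walRootDir, ZKWatcher zk)
      throws Exception {
    boolean hasProcWals =
        fs.exists(new Path(walRootDir, WALProcedureStore.MASTER_PROCEDURE_LOGDIR));
    boolean hasRegionWals =
        fs.exists(new Path(walRootDir, HConstants.HREGION_LOGDIR_NAME));
    boolean hasZkData = ZKClusterId.readClusterIdZNode(zk) != null;
    return !hasProcWals && !hasRegionWals && !hasZkData;
  }
}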

Contributor

Sorry for the delay; I need to read through the analysis you put above. What you mention about the start hanging after a cluster recreate because META does not get assigned is correct. Can you please create a sub-issue for this case, i.e. knowing whether we are starting the HMaster after a recreate (creating a cluster over existing data)?

Contributor

In an HMaster start, during bootstrap we create the ClusterID, write it to the FS and then to ZK, and then create the META table FS layout. So in a cluster recreate we will see the clusterID in the FS, and the META FS layout too, but no clusterID in ZK. It seems we can use this as the indication of a cluster recreate over existing data. This is something the HMaster should check and track at the very start. If this mode is true, later when (and if) we do INIT_META_WRITE_FS_LAYOUT, we should not delete the META dir. When we write that proc to the MasterProcWAL as part of the bootstrap, we can include this mode (a boolean) as well; it is a protobuf message anyway. So even if this HMaster is killed and restarted (at a point where the clusterId was written to ZK but the META FS layout part was not reached), we can use the info added as part of the bootstrap WAL entry and make sure NOT to delete the meta dir.
Can we do this part alone in a sub-task and provide a patch, please? This is a very key part, which is why it is better to fine-tune it with all the different test cases. Sounds good?
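A minimal sketch of the indicator described above, built from existing utilities (FSUtils.getClusterId and ZKClusterId.readClusterIdZNode); the helper class and where it would be called from during master startup are assumptions:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterId;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;

public final class RecreateDetector {
  // clusterId on the FS but not in ZK suggests a cluster recreated over existing data
  static boolean isRecreateOverExistingData(FileSystem fs, Path rootDir, ZKWatcher zk)
      throws Exception {
    ClusterId fsClusterId = FSUtils.getClusterId(fs, rootDir); // null on a truly new deploy
    String zkClusterId = ZKClusterId.readClusterIdZNode(zk);   // null when ZK data is fresh
    return fsClusterId != null && zkClusterId == null;
  }
}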

Contributor Author

Sounds right to me. As you suggested, we will put this PR on hold, pending the new sub-task. I will try to send another JIRA and PR out in a few days and refer to the conversation we discussed here.

Thanks again, Anoop.

Member

we should not delete the META dir.

Sorry for harping on an implementation detail: let's sideline meta and not delete please :).

Can we do this part alone in a sub task and a provide a patch pls? This is very key part..

This seems like a very reasonable starting point. Like Anoop points out, if we can be very sure that we will only trigger this case when we are absolutely sure we're in the "cloud recreate" situation, that will bring a lot of confidence.

I will try to send another JIRA and PR out in a few days and refer to the conversation we discussed here.

Lazy Josh: did you get a new Jira created already for this?
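For reference, a minimal sketch of the "sideline, don't delete" idea inside writeFsLayout; the sideline path naming is made up for illustration:

// hypothetical replacement for the delete in writeFsLayout
Path sidelineDir = new Path(rootDir, ".sidelined-meta-" + System.currentTimeMillis());
if (fs.exists(tableDir) && !fs.rename(tableDir, sidelineDir)) {
  LOG.warn("Could not sideline existing meta dir {} to {}", tableDir, sidelineDir);
}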

Contributor Author

@joshelser the new JIRA is HBASE-24833 and the discussion is mainly on the new PR #2237. I may need to send an email to the dev@ list for a broader discussion on whether we should avoid depending on the data in ZooKeeper (that would help us avoid deleting the meta directory).

HConstants.DEFAULT_REMOVE_META_ON_RESTART);
// we use zookeeper data to tell whether this is a partially created meta; if so, we should
// delete and recreate the meta table.
if (removeMeta && fs.exists(tableDir) && !fs.delete(tableDir, true)) {
LOG.warn("Cannot delete partially created meta table, continuing...");
}
// Bootstrapping, make sure blockcache is off. Else, one will be
hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestRecreateCluster.java (new file)
@@ -0,0 +1,226 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.time.Duration;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;

import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;

/**
* Test that storefiles in the data directory can be reused when the cluster fails over to a
* set of new region servers with different hostnames, with or without WALs and ZooKeeper
* znodes. Any hbase system table and user table should be assigned normally after the
* cluster restart.
*/
@Category({ LargeTests.class })
public class TestRecreateCluster {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestRecreateCluster.class);

@Rule
public TestName name = new TestName();

private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
private static final int NUM_RS = 3;
private static final long TIMEOUT_MS = Duration.ofMinutes(2).toMillis();

@Before
public void setup() {
TEST_UTIL.getConfiguration().setBoolean(HConstants.REMOVE_META_ON_RESTART, false);
}

@Test
public void testRecreateCluster_UserTableDisabled() throws Exception {
TEST_UTIL.startMiniCluster(NUM_RS);
try {
TableName tableName = TableName.valueOf("t1");
prepareDataBeforeRecreate(TEST_UTIL, tableName);
TEST_UTIL.getAdmin().disableTable(tableName);
TEST_UTIL.waitTableDisabled(tableName.getName());
restartHBaseCluster(true);
TEST_UTIL.getAdmin().enableTable(tableName);
validateDataAfterRecreate(TEST_UTIL, tableName);
} finally {
TEST_UTIL.shutdownMiniCluster();
}
}

@Test
public void testRecreateCluster_UserTableEnabled() throws Exception {
validateRecreateClusterWithUserTableEnabled(true);
}

@Test
public void testRecreateCluster_UserTableEnabled_WithoutCleanupWALsAndZNodes() throws Exception {
TEST_UTIL.getConfiguration().setBoolean(HConstants.REMOVE_META_ON_RESTART,
HConstants.DEFAULT_REMOVE_META_ON_RESTART);
validateRecreateClusterWithUserTableEnabled(false);
}

private void validateRecreateClusterWithUserTableEnabled(boolean cleanupWALsAndZNodes)
throws Exception {
TEST_UTIL.startMiniCluster(NUM_RS);
try {
TableName tableName = TableName.valueOf("t1");
prepareDataBeforeRecreate(TEST_UTIL, tableName);
restartHBaseCluster(cleanupWALsAndZNodes);
validateDataAfterRecreate(TEST_UTIL, tableName);
} finally {
TEST_UTIL.shutdownMiniCluster();
}
}

private void restartHBaseCluster(boolean cleanUpWALsAndZNodes) throws Exception {
// flush cache so that everything is on disk
TEST_UTIL.getMiniHBaseCluster().flushcache();

List<ServerName> oldServers =
TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList();

// make sure there are no pending procedures
TEST_UTIL.waitFor(TIMEOUT_MS, () -> TEST_UTIL.getHBaseCluster().getMaster()
.getProcedures().stream().allMatch(p -> p.isFinished()));

// shutdown and delete data if needed
Path walRootDirPath = TEST_UTIL.getMiniHBaseCluster().getMaster().getWALRootDir();
Path rootDirPath = CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration());
TEST_UTIL.shutdownMiniHBaseCluster();

if (cleanUpWALsAndZNodes) {
TEST_UTIL.getDFSCluster().getFileSystem()
.delete(new Path(rootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true);
TEST_UTIL.getDFSCluster().getFileSystem()
.delete(new Path(walRootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true);
TEST_UTIL.getDFSCluster().getFileSystem()
.delete(new Path(walRootDirPath, WALProcedureStore.MASTER_PROCEDURE_LOGDIR), true);

TEST_UTIL.getDFSCluster().getFileSystem()
.delete(new Path(walRootDirPath, HConstants.HREGION_LOGDIR_NAME), true);
TEST_UTIL.getDFSCluster().getFileSystem()
.delete(new Path(walRootDirPath, HConstants.HREGION_OLDLOGDIR_NAME), true);
// delete all zk data
// we cannot keep the ZK data because it would hold the meta region state as OPEN while
// no InitMetaProcedure has been submitted
ZKUtil.deleteChildrenRecursively(TEST_UTIL.getZooKeeperWatcher(),
TEST_UTIL.getZooKeeperWatcher().getZNodePaths().baseZNode);
TEST_UTIL.shutdownMiniZKCluster();
TEST_UTIL.startMiniZKCluster();
}

TEST_UTIL.restartHBaseCluster(NUM_RS);
TEST_UTIL.waitFor(TIMEOUT_MS,
() -> TEST_UTIL.getMiniHBaseCluster().getNumLiveRegionServers() == NUM_RS);

// make sure we have a new set of region servers with different hostnames and ports
List<ServerName> newServers =
TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList();
assertTrue(newServers.stream().noneMatch(oldServers::contains));
}

private void prepareDataBeforeRecreate(
HBaseTestingUtility testUtil, TableName tableName) throws Exception {
Table table = testUtil.createTable(tableName, "f");
Put put = new Put(Bytes.toBytes("r1"));
put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes("v"));
table.put(put);

ensureTableNotColocatedWithSystemTable(tableName, TableName.NAMESPACE_TABLE_NAME);
}

private void ensureTableNotColocatedWithSystemTable(TableName userTable, TableName systemTable)
throws IOException, InterruptedException {
MiniHBaseCluster hbaseCluster = TEST_UTIL.getHBaseCluster();
assertTrue("Please start more than 1 regionserver",
hbaseCluster.getRegionServerThreads().size() > 1);

int userTableServerNum = getServerNumForTableWithOnlyOneRegion(userTable);
int systemTableServerNum = getServerNumForTableWithOnlyOneRegion(systemTable);

if (userTableServerNum != systemTableServerNum) {
// no-op if the user table and the system table are already on different servers
return;
}

int destServerNum = (systemTableServerNum + 1) % NUM_RS;
assertTrue(systemTableServerNum != destServerNum);

HRegionServer systemTableServer = hbaseCluster.getRegionServer(systemTableServerNum);
HRegionServer destServer = hbaseCluster.getRegionServer(destServerNum);
assertTrue(!systemTableServer.equals(destServer));
// make sure the dest server is live before moving region
hbaseCluster.waitForRegionServerToStart(destServer.getServerName().getHostname(),
destServer.getServerName().getPort(), TIMEOUT_MS);
// move region of userTable to a different regionserver not co-located with system table
TEST_UTIL.moveRegionAndWait(TEST_UTIL.getAdmin().getRegions(userTable).get(0),
destServer.getServerName());
}

private int getServerNumForTableWithOnlyOneRegion(TableName tableName) throws IOException {
List<RegionInfo> tableRegionInfos = TEST_UTIL.getAdmin().getRegions(tableName);
assertEquals(1, tableRegionInfos.size());
return TEST_UTIL.getHBaseCluster()
.getServerWith(tableRegionInfos.get(0).getRegionName());
}

private void validateDataAfterRecreate(
HBaseTestingUtility testUtil, TableName tableName) throws Exception {
Table t1 = testUtil.getConnection().getTable(tableName);
Get get = new Get(Bytes.toBytes("r1"));
get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"));
Result result = t1.get(get);
assertTrue(result.advance());
Cell cell = result.current();
assertEquals("v", Bytes.toString(cell.getValueArray(),
cell.getValueOffset(), cell.getValueLength()));
assertFalse(result.advance());
}

}