-
Notifications
You must be signed in to change notification settings - Fork 3.4k
HBASE-25902 Add missing CFs in meta during HBase 1 to 2 Upgrade #3417
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| /* | ||
| * | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hadoop.hbase; | ||
|
|
||
| import org.apache.yetus.audience.InterfaceAudience; | ||
|
|
||
| /** | ||
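| * Thrown when the active master must be aborted and restarted, e.g. after it has added the | ||
| * missing column families to hbase:meta during an HBase 1 to 2 upgrade. | ||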
| */ | ||
| @InterfaceAudience.Public | ||
| public class PleaseRestartMasterException extends HBaseIOException { | ||
|
|
||
| public PleaseRestartMasterException(final String s) { | ||
| super(s); | ||
| } | ||
|
|
||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -70,6 +70,7 @@ | |
| import org.apache.hadoop.hbase.MasterNotRunningException; | ||
| import org.apache.hadoop.hbase.NamespaceDescriptor; | ||
| import org.apache.hadoop.hbase.PleaseHoldException; | ||
| import org.apache.hadoop.hbase.PleaseRestartMasterException; | ||
| import org.apache.hadoop.hbase.RegionMetrics; | ||
| import org.apache.hadoop.hbase.ReplicationPeerNotFoundException; | ||
| import org.apache.hadoop.hbase.ServerMetrics; | ||
|
|
@@ -175,6 +176,7 @@ | |
| import org.apache.hadoop.hbase.quotas.SpaceQuotaSnapshotNotifierFactory; | ||
| import org.apache.hadoop.hbase.quotas.SpaceViolationPolicy; | ||
| import org.apache.hadoop.hbase.regionserver.HRegionServer; | ||
| import org.apache.hadoop.hbase.regionserver.NoSuchColumnFamilyException; | ||
| import org.apache.hadoop.hbase.regionserver.RSRpcServices; | ||
| import org.apache.hadoop.hbase.replication.ReplicationException; | ||
| import org.apache.hadoop.hbase.replication.ReplicationLoadSource; | ||
|
|
@@ -191,6 +193,7 @@ | |
| import org.apache.hadoop.hbase.trace.TraceUtil; | ||
| import org.apache.hadoop.hbase.util.Addressing; | ||
| import org.apache.hadoop.hbase.util.Bytes; | ||
| import org.apache.hadoop.hbase.util.FSTableDescriptors; | ||
| import org.apache.hadoop.hbase.util.HBaseFsck; | ||
| import org.apache.hadoop.hbase.util.HFileArchiveUtil; | ||
| import org.apache.hadoop.hbase.util.IdLock; | ||
|
|
@@ -953,9 +956,26 @@ private void finishActiveMasterInitialization(MonitoredTask status) | |
| if (!waitForMetaOnline()) { | ||
| return; | ||
| } | ||
| TableDescriptor metaDescriptor = tableDescriptors.get( | ||
| TableName.META_TABLE_NAME); | ||
| final ColumnFamilyDescriptor tableFamilyDesc = metaDescriptor | ||
| .getColumnFamily(HConstants.TABLE_FAMILY); | ||
| final ColumnFamilyDescriptor replBarrierFamilyDesc = | ||
| metaDescriptor.getColumnFamily(HConstants.REPLICATION_BARRIER_FAMILY); | ||
|
|
||
| this.assignmentManager.joinCluster(); | ||
| // The below depends on hbase:meta being online. | ||
| this.tableStateManager.start(); | ||
| try { | ||
| this.tableStateManager.start(); | ||
| } catch (NoSuchColumnFamilyException e) { | ||
| if (tableFamilyDesc == null && replBarrierFamilyDesc == null) { | ||
| LOG.info("TableStates manager could not be started. This is expected" | ||
| + " during HBase 1 to 2 upgrade.", e); | ||
| } else { | ||
| throw e; | ||
| } | ||
| } | ||
|
|
||
| this.assignmentManager.processOfflineRegions(); | ||
| // this must be called after the above processOfflineRegions to prevent race | ||
| this.assignmentManager.wakeMetaLoadedEvent(); | ||
|
|
@@ -1025,7 +1045,17 @@ private void finishActiveMasterInitialization(MonitoredTask status) | |
| return; | ||
| } | ||
| status.setStatus("Starting cluster schema service"); | ||
| initClusterSchemaService(); | ||
| try { | ||
| initClusterSchemaService(); | ||
| } catch (IllegalStateException e) { | ||
| if (e.getCause() != null && e.getCause() instanceof NoSuchColumnFamilyException | ||
| && tableFamilyDesc == null && replBarrierFamilyDesc == null) { | ||
| LOG.info("ClusterSchema service could not be initialized. This is " | ||
| + "expected during HBase 1 to 2 upgrade", e); | ||
| } else { | ||
| throw e; | ||
| } | ||
| } | ||
|
|
||
| if (this.cpHost != null) { | ||
| try { | ||
|
|
@@ -1047,6 +1077,29 @@ private void finishActiveMasterInitialization(MonitoredTask status) | |
| // Set master as 'initialized'. | ||
| setInitialized(true); | ||
|
|
||
| if (tableFamilyDesc == null && replBarrierFamilyDesc == null) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If only one of the two CFs is missing, do we need to create it as well?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was thinking that too, but for an HBase 1 to 2.3+ upgrade both will certainly be null, so with this check we are targeting that specific upgrade case. |
||
| // create missing CFs in meta table after master is set to 'initialized'. | ||
| createMissingCFsInMetaDuringUpgrade(metaDescriptor); | ||
|
|
||
| // Throwing this Exception to abort active master is painful but this | ||
| // seems the only way to add missing CFs in meta while upgrading from | ||
| // HBase 1 to 2 (where HBase 2 has HBASE-23055 & HBASE-23782 checked-in). | ||
| // So, why do we abort active master after adding missing CFs in meta? | ||
| // When we reach here, we would have already bypassed NoSuchColumnFamilyException | ||
| // in initClusterSchemaService(), meaning ClusterSchemaService is not | ||
| // correctly initialized but we bypassed it. Similarly, we bypassed | ||
| // tableStateManager.start() as well. Hence, we should better abort | ||
| // current active master because our main task - adding missing CFs | ||
| // in meta table is done (possible only after master state is set as | ||
| // initialized) at the expense of bypassing few important tasks as part | ||
| // of active master init routine. So now we abort active master so that | ||
| // next active master init will not face any issues and all mandatory | ||
| // services will be started during master init phase. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice note. |
||
| throw new PleaseRestartMasterException("Aborting active master after missing" | ||
| + " CFs are successfully added in meta. Subsequent active master " | ||
| + "initialization should be uninterrupted"); | ||
| } | ||
|
|
||
| if (maintenanceMode) { | ||
| LOG.info("Detected repair mode, skipping final initialization steps."); | ||
| return; | ||
|
|
@@ -1106,6 +1159,38 @@ private void finishActiveMasterInitialization(MonitoredTask status) | |
| } | ||
| } | ||
|
|
||
| private void createMissingCFsInMetaDuringUpgrade( | ||
| TableDescriptor metaDescriptor) throws IOException { | ||
| TableDescriptor newMetaDesc = | ||
| TableDescriptorBuilder.newBuilder(metaDescriptor) | ||
| .setColumnFamily(FSTableDescriptors.getTableFamilyDescForMeta(conf)) | ||
| .setColumnFamily(FSTableDescriptors.getReplBarrierFamilyDescForMeta()) | ||
| .build(); | ||
| long pid = this.modifyTable(TableName.META_TABLE_NAME, () -> newMetaDesc, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can this procedure be run without ClusterSchemaService?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, apparently this can be done without ClusterSchemaService, as per my dev testing. |
||
| 0, 0, false); | ||
| int tries = 30; | ||
| while (!(getMasterProcedureExecutor().isFinished(pid)) | ||
| && getMasterProcedureExecutor().isRunning() && tries > 0) { | ||
| try { | ||
| Thread.sleep(1000); | ||
| } catch (InterruptedException e) { | ||
| throw new IOException("Wait interrupted", e); | ||
| } | ||
| tries--; | ||
| } | ||
| if (tries <= 0) { | ||
| throw new HBaseIOException( | ||
| "Failed to add table and rep_barrier CFs to meta in a given time."); | ||
| } else { | ||
| Procedure<?> result = getMasterProcedureExecutor().getResult(pid); | ||
| if (result != null && result.isFailed()) { | ||
| throw new IOException( | ||
| "Failed to add table and rep_barrier CFs to meta. " | ||
| + MasterProcedureUtil.unwrapRemoteIOException(result)); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Check hbase:meta is up and ready for reading. For use during Master startup only. | ||
| * @return True if meta is UP and online and startup can progress. Otherwise, meta is not online | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems 2.0.x had only the extra table state CF, and 2.1.x added the replication barrier CF. It is the missing table state CF that causes the startup issue, right?
When it's a 2.0.x to 2.3.x upgrade, will the replication barrier CF get auto-created?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This PR is for 2.3+ releases, meaning that if we come from HBase 1 to 2.3+, the transition should be seamless. Hence, for this upgrade case, both the table and repl_barrier CFs will be missing.
I have not tried this one. If the table CF is handled, I guess repl_barrier would have been handled too. As per @saintstack's testing, the cluster first had to go from 1.2 to a release earlier than 2.3 and from that release on to 2.3, hence I am assuming an HBase 2 (< 2.3) to 2.3 upgrade should have been smooth. @saintstack thoughts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've run various versions of hbase-2.0-hbase2.2.x upgrades successfully... It was when I tried to go from an hbase1.2 version straight to hbase2.3 that I ran into this issue.
Otherwise, I agree w/ your thinking around the repl_barrier and table CFs. Is there a case where one might be in place but not the other? I don't know. And does the code do the right thing in that case? (I've not checked.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Exactly, the case where only one of 'table' or 'repl_barrier' is missing would be a weird case and should not happen, given that HBase 2 upgrades are not going to face missing-CF issues. That's why I kept `&&` here instead of `||`, to make sure we are specifically handling the HBase 1 to 2.3+ upgrade case. FYI @anoopsjohn
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok. Good. Thanks.