Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1bdef8a
fix: throw errors instead of ignoring
WenyXu Mar 28, 2025
a952131
fix: fix unit tests
WenyXu Mar 28, 2025
d670bf2
refactor: remove schema version check
WenyXu Mar 28, 2025
27a2017
fix: fix clippy
WenyXu Mar 28, 2025
aa018e3
chore: remove unused error
WenyXu Mar 28, 2025
2cd72f4
refactor: remove schema version check
WenyXu Mar 31, 2025
b3b89c6
feat: handle mutliple results
WenyXu Mar 31, 2025
ac9efe4
feat: introduce consistency guard
WenyXu Mar 31, 2025
d86de4d
fix: release consistency guard on datanode operation completion
WenyXu Mar 31, 2025
e0ff5e1
test: add tests
WenyXu Mar 31, 2025
2556411
Merge branch 'main' into fix/alter-table
WenyXu Mar 31, 2025
1d49541
chore: remove schema version
WenyXu Mar 31, 2025
edcc169
refactor: rename
WenyXu Mar 31, 2025
c92d385
test: add more tests
WenyXu Mar 31, 2025
15cad34
chore: print all error
WenyXu Apr 1, 2025
57208e5
tests: query table after alteration
WenyXu Apr 2, 2025
52325e8
log ignored request
WenyXu Apr 2, 2025
515a85b
refine fuzz test
WenyXu Apr 2, 2025
71b2529
chore: fix clippy and log mailbox message
WenyXu Apr 2, 2025
f73ddc5
chore: close prepared statement after execution
WenyXu Apr 2, 2025
0bca295
chore: add comment
WenyXu Apr 2, 2025
96b4e41
chore: remove log
WenyXu Apr 2, 2025
f0f19b4
chore: rename to `ConsistencyPoison`
WenyXu Apr 2, 2025
c61cf91
Merge branch 'main' into fix/alter-table
WenyXu Apr 7, 2025
2cd1de0
chore: remove unused error
WenyXu Apr 7, 2025
a6e0ccb
fix: fix unit tests
WenyXu Apr 7, 2025
f5b69f6
chore: apply suggestions from CR
WenyXu Apr 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 80 additions & 24 deletions src/common/meta/src/ddl/alter_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,31 @@ use std::vec;
use api::v1::alter_table_expr::Kind;
use api::v1::RenameTable;
use async_trait::async_trait;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_error::ext::BoxedError;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{
Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, Status, StringKey,
Context as ProcedureContext, ContextProvider, Error as ProcedureError, LockKey, PoisonKey,
PoisonKeys, Procedure, ProcedureId, Status, StringKey,
};
use common_telemetry::{debug, error, info};
use futures::future;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use snafu::{ensure, ResultExt};
use store_api::storage::RegionId;
use strum::AsRefStr;
use table::metadata::{RawTableInfo, TableId, TableInfo};
use table::table_reference::TableReference;

use crate::cache_invalidator::Context;
use crate::ddl::utils::add_peer_context_if_needed;
use crate::ddl::utils::{add_peer_context_if_needed, handle_multiple_results, MultipleResults};
use crate::ddl::DdlContext;
use crate::error::{Error, Result};
use crate::error::{AbortProcedureSnafu, Error, NoLeaderSnafu, PutPoisonSnafu, Result};
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::{DeserializedValueWithBytes, RegionDistribution};
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::metrics;
use crate::poison_key::table_poison_key;
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders, region_distribution};

Expand Down Expand Up @@ -104,7 +105,27 @@ impl AlterTableProcedure {
Ok(Status::executing(true))
}

pub async fn submit_alter_region_requests(&mut self) -> Result<Status> {
fn table_poison_key(&self) -> PoisonKey {
table_poison_key(self.data.table_id())
}

async fn put_poison(
&self,
ctx_provider: &dyn ContextProvider,
procedure_id: ProcedureId,
) -> Result<()> {
let poison_key = self.table_poison_key();
ctx_provider
.try_put_poison(&poison_key, procedure_id)
.await
.context(PutPoisonSnafu)
}

pub async fn submit_alter_region_requests(
&mut self,
procedure_id: ProcedureId,
ctx_provider: &dyn ContextProvider,
) -> Result<Status> {
let table_id = self.data.table_id();
let (_, physical_table_route) = self
.context
Expand All @@ -127,6 +148,9 @@ impl AlterTableProcedure {
alter_kind,
);

ensure!(!leaders.is_empty(), NoLeaderSnafu { table_id });
// Puts the poison before submitting alter region requests to datanodes.
self.put_poison(ctx_provider, procedure_id).await?;
for datanode in leaders {
let requester = self.context.node_manager.datanode(&datanode).await;
let regions = find_leader_regions(&physical_table_route.region_routes, &datanode);
Expand All @@ -140,28 +164,51 @@ impl AlterTableProcedure {
let requester = requester.clone();

alter_region_tasks.push(async move {
if let Err(err) = requester.handle(request).await {
if err.status_code() != StatusCode::RequestOutdated {
// Treat request outdated as success.
// The engine will throw this code when the schema version not match.
// As this procedure has locked the table, the only reason for this error
// is procedure is succeeded before and is retrying.
return Err(add_peer_context_if_needed(datanode)(err));
}
}
Ok(())
requester
.handle(request)
.await
.map_err(add_peer_context_if_needed(datanode))
Comment thread
WenyXu marked this conversation as resolved.
});
}
}

future::join_all(alter_region_tasks)
let results = future::join_all(alter_region_tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;

self.data.state = AlterTableState::UpdateMetadata;
.collect::<Vec<_>>();

Ok(Status::executing(true))
match handle_multiple_results(results) {
MultipleResults::PartialRetryable(error) => {
// Just returns the error, and wait for the next try.
Err(error)
}
MultipleResults::PartialNonRetryable(error) => {
error!(error; "Partial non-retryable errors occurred during alter table, table {}, table_id: {}", self.data.table_ref(), self.data.table_id());
// No retry will be done.
Ok(Status::poisoned(
Some(self.table_poison_key()),
ProcedureError::external(error),
))
}
MultipleResults::AllRetryable(error) => {
// Just returns the error, and wait for the next try.
Err(error)
}
MultipleResults::Ok => {
self.data.state = AlterTableState::UpdateMetadata;
Ok(Status::executing_with_clean_poisons(true))
}
MultipleResults::AllNonRetryable(error) => {
error!(error; "All alter requests returned non-retryable errors for table {}, table_id: {}", self.data.table_ref(), self.data.table_id());
// It assumes the metadata on datanode is not changed.
// Case: The alter region request is sent but not applied. (e.g., InvalidArgument)

let err = BoxedError::new(error);
Err(err).context(AbortProcedureSnafu {
clean_poisons: true,
})
}
}
}

/// Update table metadata.
Expand Down Expand Up @@ -250,10 +297,12 @@ impl Procedure for AlterTableProcedure {
Self::TYPE_NAME
}

async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
async fn execute(&mut self, ctx: &ProcedureContext) -> ProcedureResult<Status> {
let error_handler = |e: Error| {
if e.is_retry_later() {
ProcedureError::retry_later(e)
} else if e.need_clean_poisons() {
ProcedureError::external_and_clean_poisons(e)
} else {
ProcedureError::external(e)
}
Expand All @@ -269,7 +318,10 @@ impl Procedure for AlterTableProcedure {

match state {
AlterTableState::Prepare => self.on_prepare().await,
AlterTableState::SubmitAlterRegionRequests => self.submit_alter_region_requests().await,
AlterTableState::SubmitAlterRegionRequests => {
self.submit_alter_region_requests(ctx.procedure_id, ctx.provider.as_ref())
.await
}
AlterTableState::UpdateMetadata => self.on_update_metadata().await,
AlterTableState::InvalidateTableCache => self.on_broadcast().await,
}
Expand All @@ -285,6 +337,10 @@ impl Procedure for AlterTableProcedure {

LockKey::new(key)
}

fn poison_keys(&self) -> PoisonKeys {
PoisonKeys::new(vec![self.table_poison_key()])
}
}

#[derive(Debug, Serialize, Deserialize, AsRefStr)]
Expand Down
4 changes: 3 additions & 1 deletion src/common/meta/src/ddl/create_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,9 @@ impl Procedure for CreateTableProcedure {
.creator
.register_opening_regions(&self.context, &x.region_routes)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
.context(ExternalSnafu {
clean_poisons: false,
})?;
}

Ok(())
Expand Down
4 changes: 3 additions & 1 deletion src/common/meta/src/ddl/drop_database.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ impl Procedure for DropDatabaseProcedure {
self.state
.recover(&self.runtime_context)
.map_err(BoxedError::new)
.context(ExternalSnafu)
.context(ExternalSnafu {
clean_poisons: false,
})
}

async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
Expand Down
4 changes: 3 additions & 1 deletion src/common/meta/src/ddl/drop_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,9 @@ impl Procedure for DropTableProcedure {
if register_operating_regions {
self.register_dropping_regions()
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
.context(ExternalSnafu {
clean_poisons: false,
})?;
}

Ok(())
Expand Down
71 changes: 71 additions & 0 deletions src/common/meta/src/ddl/test_util/datanode_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,74 @@ impl MockDatanodeHandler for NaiveDatanodeHandler {
unreachable!()
}
}

#[derive(Clone)]
pub struct PartialSuccessDatanodeHandler {
pub retryable: bool,
}

#[async_trait::async_trait]
impl MockDatanodeHandler for PartialSuccessDatanodeHandler {
async fn handle(&self, peer: &Peer, _request: RegionRequest) -> Result<RegionResponse> {
let success = peer.id % 2 == 0;
if success {
Ok(RegionResponse::new(0))
} else if self.retryable {
Err(Error::RetryLater {
source: BoxedError::new(
error::UnexpectedSnafu {
err_msg: "retry later",
}
.build(),
),
})
} else {
error::UnexpectedSnafu {
err_msg: "mock error",
}
.fail()
}
}

async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}

#[derive(Clone)]
pub struct AllFailureDatanodeHandler {
pub retryable: bool,
}

#[async_trait::async_trait]
impl MockDatanodeHandler for AllFailureDatanodeHandler {
async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> Result<RegionResponse> {
if self.retryable {
Err(Error::RetryLater {
source: BoxedError::new(
error::UnexpectedSnafu {
err_msg: "retry later",
}
.build(),
),
})
} else {
error::UnexpectedSnafu {
err_msg: "mock error",
}
.fail()
}
}

async fn handle_query(
&self,
_peer: &Peer,
_request: QueryRequest,
) -> Result<SendableRecordBatchStream> {
unreachable!()
}
}
Loading
Loading