Skip to content
9 changes: 6 additions & 3 deletions pkg/kv/kvserver/api.proto
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,18 @@ message CollectChecksumRequest {
bytes checksum_id = 3 [(gogoproto.nullable) = false,
(gogoproto.customname) = "ChecksumID",
(gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"];
bytes checksum = 4;
reserved 4;
// If true then the response must include the snapshot of the data from which
// the checksum is computed.
bool with_snapshot = 5;
}

message CollectChecksumResponse {
// The checksum is the sha512 hash of the requested computation. It is empty
// if the computation failed.
bytes checksum = 1;
// snapshot is set if the roachpb.ComputeChecksumRequest had snapshot = true
// and the response checksum is different from the request checksum.
// snapshot is set if the with_snapshot in CollectChecksumRequest is true. For
// example, it can be set by the caller when it has detected an inconsistency.
//
// TODO(tschottdorf): with larger ranges, this is no longer tenable.
// See https://github.com/cockroachdb/cockroach/issues/21128.
Expand Down
9 changes: 5 additions & 4 deletions pkg/kv/kvserver/consistency_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ const consistencyCheckRateMinWait = 100 * time.Millisecond
// replication factor of 7 could run 7 concurrent checks on every node.
//
// Note that checksum calculations below Raft are not tied to the caller's
// context (especially on followers), and will continue to run even after the
// caller has given up on them, which may cause them to build up.
// context, and may continue to run even after the caller has given up on them,
// which may cause them to build up. Although we do best effort to cancel the
// running task on the receiving end when the incoming request is aborted.
//
// CHECK_STATS checks do not count towards this limit, as they are cheap and the
// DistSender will parallelize them across all ranges (notably when calling
Expand All @@ -76,8 +77,8 @@ const consistencyCheckAsyncConcurrency = 7

// consistencyCheckAsyncTimeout is a below-Raft timeout for asynchronous
// consistency check calculations. These are not tied to the caller's context,
// and thus will continue to run even after the caller has given up on them, so
// we give them an upper timeout to prevent them from running forever.
// and thus may continue to run even if the caller has given up on them, so we
// give them an upper timeout to prevent them from running forever.
const consistencyCheckAsyncTimeout = time.Hour

var testingAggressiveConsistencyChecks = envutil.EnvOrDefaultBool("COCKROACH_CONSISTENCY_AGGRESSIVE", false)
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ type Replica struct {
lastUpdateTimes lastUpdateTimesMap

// Computed checksum at a snapshot UUID.
checksums map[uuid.UUID]replicaChecksum
checksums map[uuid.UUID]*replicaChecksum

// proposalQuota is the quota pool maintained by the lease holder where
// incoming writes acquire quota from a fixed quota pool before going
Expand Down
8 changes: 7 additions & 1 deletion pkg/kv/kvserver/replica_application_result.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/errors"
)

// replica_application_*.go files provide concrete implementations of
Expand Down Expand Up @@ -334,7 +336,11 @@ func (r *Replica) handleVersionResult(ctx context.Context, version *roachpb.Vers
}

func (r *Replica) handleComputeChecksumResult(ctx context.Context, cc *kvserverpb.ComputeChecksum) {
r.computeChecksumPostApply(ctx, *cc)
err := r.computeChecksumPostApply(ctx, *cc)
// Don't log errors caused by the store quiescing, they are expected.
if err != nil && !errors.Is(err, stop.ErrUnavailable) {
log.Errorf(ctx, "failed to start ComputeChecksum task %s: %v", cc.ChecksumID, err)
}
}

func (r *Replica) handleChangeReplicasResult(
Expand Down
Loading