Skip to content

Commit

Permalink
fix(pageserver): ensure GC computes time cutoff using the same start …
Browse files Browse the repository at this point in the history
…time (#10193)

## Problem

close #10192

## Summary of changes

* `find_gc_time_cutoff` takes `now` parameter so that all branches
compute the cutoff based on the same start time, avoiding races.
* gc-compaction uses a single `get_gc_compaction_watermark` function to
get the safe LSN to compact.

---------

Signed-off-by: Alex Chi Z <[email protected]>
Co-authored-by: Arpad Müller <[email protected]>
  • Loading branch information
skyzh and arpad-m authored Jan 6, 2025
1 parent 95f1920 commit 4a6556e
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 5 deletions.
6 changes: 5 additions & 1 deletion pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4488,13 +4488,17 @@ impl Tenant {
let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
HashMap::with_capacity(timelines.len());

// Ensures all timelines use the same start time when computing the time cutoff.
let now_ts_for_pitr_calc = SystemTime::now();
for timeline in timelines.iter() {
let cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
.unwrap_or(Lsn(0));

let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
let cutoffs = timeline
.find_gc_cutoffs(now_ts_for_pitr_calc, cutoff, pitr, cancel, ctx)
.await?;
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
assert!(old.is_none());
}
Expand Down
5 changes: 3 additions & 2 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4859,14 +4859,14 @@ impl Timeline {

async fn find_gc_time_cutoff(
&self,
now: SystemTime,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Option<Lsn>, PageReconstructError> {
debug_assert_current_span_has_tenant_and_timeline_id();
if self.shard_identity.is_shard_zero() {
// Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself
let now = SystemTime::now();
let time_range = if pitr == Duration::ZERO {
humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
} else {
Expand Down Expand Up @@ -4952,6 +4952,7 @@ impl Timeline {
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(super) async fn find_gc_cutoffs(
&self,
now: SystemTime,
space_cutoff: Lsn,
pitr: Duration,
cancel: &CancellationToken,
Expand Down Expand Up @@ -4979,7 +4980,7 @@ impl Timeline {
// - if PITR interval is set, then this is our cutoff.
// - if PITR interval is not set, then we do a lookup
// based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?;
let time_cutoff = self.find_gc_time_cutoff(now, pitr, cancel, ctx).await?;

Ok(match (pitr, time_cutoff) {
(Duration::ZERO, Some(time_cutoff)) => {
Expand Down
22 changes: 20 additions & 2 deletions pageserver/src/tenant/timeline/compaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1799,6 +1799,24 @@ impl Timeline {
Ok(())
}

/// Get a watermark for gc-compaction, that is the lowest LSN that we can use as the `gc_horizon` for
/// the compaction algorithm. It is min(space_cutoff, time_cutoff, latest_gc_cutoff, standby_horizon).
/// Leases and retain_lsns are considered in the gc-compaction job itself so we don't need to account for them
/// here.
pub(crate) fn get_gc_compaction_watermark(self: &Arc<Self>) -> Lsn {
let gc_cutoff_lsn = {
let gc_info = self.gc_info.read().unwrap();
gc_info.min_cutoff()
};

// TODO: standby horizon should use leases so we don't really need to consider it here.
// let watermark = watermark.min(self.standby_horizon.load());

// TODO: ensure the child branches will not use anything below the watermark, or consider
// them when computing the watermark.
gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn())
}

/// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
/// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN
/// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts
Expand All @@ -1811,7 +1829,7 @@ impl Timeline {
let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
job.compact_lsn_range.end
} else {
*self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
self.get_gc_compaction_watermark()
};

// Split compaction job to about 4GB each
Expand Down Expand Up @@ -2006,7 +2024,7 @@ impl Timeline {
// Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
// cleaning everything that theoritically it could. In the future, it should use `self.gc_info`
// to get the truth data.
let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
let real_gc_cutoff = self.get_gc_compaction_watermark();
// The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
// each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use
// the real cutoff.
Expand Down

1 comment on commit 4a6556e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7377 tests run: 7014 passed, 1 failed, 362 skipped (full report)


Failures on Postgres 16

  • test_storage_controller_many_tenants[github-actions-selfhosted]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_storage_controller_many_tenants[release-pg16-github-actions-selfhosted]"

Code coverage* (full report)

  • functions: 31.2% (8411 of 26962 functions)
  • lines: 48.0% (66794 of 139233 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
4a6556e at 2025-01-06T21:35:57.666Z :recycle:

Please sign in to comment.