Skip to content
72 changes: 72 additions & 0 deletions app/jobs/identities_backfill_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
class IdentitiesBackfillJob < ApplicationJob
# This is a short-term solution to backfill data requiring a table scan.
# This job can be deleted once it's done.

queue_as :long_running

# Let's give us the option to fine-tune this on the fly
BATCH_SIZE_KEY = 'IdentitiesBackfillJob.batch_size'.freeze
SLICE_SIZE_KEY = 'IdentitiesBackfillJob.slice_size'.freeze
CACHE_KEY = 'IdentitiesBackfillJob.position'.freeze

def perform
start_time = Time.zone.now
max_id = ServiceProviderIdentity.last.id

if position > max_id
logger.info "backfill reached end; skipping. position=#{position} max_id=#{max_id}"
return
end

last_id = position

(batch_size / slice_size).times.each do |slice_num|
start_id = position + (slice_size * slice_num)
next if start_id > max_id

params = {
min_id: start_id,
max_id: start_id + slice_size,
}.transform_values { |v| ActiveRecord::Base.connection.quote(v) }
sp_query = format(<<~SQL, params)
UPDATE identities
SET last_consented_at = created_at
WHERE id > %{min_id}
AND id <= %{max_id}
AND deleted_at IS NULL
AND last_consented_at IS NULL
RETURNING id
SQL

result = ActiveRecord::Base.connection.execute(sp_query)

last_id = result.to_a&.first ? result.to_a.first['id'] : start_id + slice_size

logger.info "Processed rows #{start_id} through #{last_id} (updated=#{result.ntuples})"
end

elapsed_time = Time.zone.now - start_time
logger.info(
"Finished a full batch (#{batch_size} rows to id=#{last_id}) in #{elapsed_time} seconds",
)
Comment on lines +48 to +51
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we combine this log message with the one above? that way we only need one log line per job run to check.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We discussed this in chat, but for the logs here: the previous log entry is inside the per-slice loop, while this is at the end of the whole batch. We concluded we'd keep both.


# If we made it here without error, increment the counter for next time:
REDIS_POOL.with { |redis| redis.set(CACHE_KEY, last_id) }
end

def position
redis_get(CACHE_KEY, 0)
end

def batch_size
redis_get(BATCH_SIZE_KEY, 500_000)
end

def slice_size
redis_get(SLICE_SIZE_KEY, 10_000)
end

def redis_get(key, default)
(REDIS_POOL.with { |redis| redis.get(key) } || default).to_i
end
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I pulled this out into its own method because I found a bug in how I was doing this initially and figured I'd move it all to one place and fix it.

I was doing redis.get(key).to_i || default and realized that redis.get(DOES_NOT_EXIST) returns nil, so this would always return 0.

end
5 changes: 5 additions & 0 deletions config/initializers/job_configurations.rb
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@
cron: cron_1h,
args: -> { [Time.zone.now - 1.hour] },
},
# Backfill service_provider_identities last_consented_at
identities_backfill_job: {
class: 'IdentitiesBackfillJob',
cron: cron_5m,
},
}
end
# rubocop:enable Metrics/BlockLength
Expand Down
8 changes: 8 additions & 0 deletions spec/factories/service_provider_identities.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,12 @@
trait :active do
last_authenticated_at { Time.zone.now }
end

trait :consented do
last_consented_at { Time.zone.now - 5.minutes }
end
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually think we probably want to make :active set last_consented_at, but maybe after this story.


trait :soft_deleted_5m_ago do
deleted_at { Time.zone.now - 5.minutes }
end
end
107 changes: 107 additions & 0 deletions spec/jobs/identities_backfill_job_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
require 'rails_helper'

RSpec.describe IdentitiesBackfillJob, type: :job do
describe '#perform' do
let!(:deleted) do
create(:service_provider_identity, :soft_deleted_5m_ago, :consented)
end

let!(:consented_at_set) do
create(:service_provider_identity, :consented)
end

let!(:no_consented_at) do
create(:service_provider_identity, :active)
end

subject { described_class.perform_now }

it 'does not update rows that have been soft-deleted' do
expect(deleted.deleted_at).to_not be_nil
consent_time = deleted.last_consented_at.to_i

subject
deleted.reload

expect(deleted.deleted_at).not_to be_nil
expect(deleted.last_consented_at.to_i).to eq(consent_time)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed in Slack, but the to_i is necessary here because otherwise we run into weird precision issues, where the DB read vs. in-memory representation seem to have a different resolution and this caused the test to fail over whether or not we show nanoseconds.

The factory sets deleted_at to "5 minutes ago" so there will be a distinct difference if the record inadvertently gets updated.

end

it 'does not update rows that already have a date populated' do
time = consented_at_set.last_consented_at
expect(time).not_to be_nil

subject

expect(consented_at_set.reload.last_consented_at.to_i).to eq time.to_i
end

it 'does update rows which are not deleted and have no last_consented_at date' do
expect(no_consented_at.last_consented_at).to be_nil
expect(no_consented_at.created_at).to_not be_nil
expect(no_consented_at.deleted_at).to be_nil

subject

expect(no_consented_at.reload.last_consented_at).to_not be_nil
end

context 'when the batch size is small' do
let(:batch_size) { 2 }
let(:slice_size) { 1 }
before do
REDIS_POOL.with do |redis|
redis.set(IdentitiesBackfillJob::BATCH_SIZE_KEY, batch_size)
redis.set(IdentitiesBackfillJob::SLICE_SIZE_KEY, slice_size)
redis.set(
IdentitiesBackfillJob::CACHE_KEY,
ServiceProviderIdentity.first.id - 1,
)
end
end

it 'increments the position properly' do
# The first time around, the position should match the second row:
described_class.perform_now
position = described_class.new.position
expect(position).to eq(ServiceProviderIdentity.second.id)

# The second time, we finish the table, so position should point past the end:
described_class.perform_now
expect(described_class.new.position).to eq(position + batch_size)
end
end

context 'when done updating the database' do
it 'updates the position in Redis' do
expect(described_class.new.position).to eq(0)

subject

expect(described_class.new.position).to eq(ServiceProviderIdentity.last.id)
end
end
end

describe '#batch_size' do
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arguably this and .position (etc.) shouldn't even be public methods, but because this is a short-lived script and I had a bug in an early draft of this, I just left them public and added a test.

subject { described_class.new.batch_size }

context 'when there is no cache key in redis' do
it 'returns the default of 500,000' do
expect(subject).to eq(500_000)
end
end

context 'when there is a cache key in redis' do
before do
REDIS_POOL.with do |redis|
redis.set(described_class::BATCH_SIZE_KEY, 1000)
end
end

it 'returns that value' do
expect(subject).to eq 1000
end
end
end
end