From a09b21136a831008fdf1b56838cd6de9e2add984 Mon Sep 17 00:00:00 2001 From: adammilnesmith Date: Wed, 10 Jul 2024 17:11:59 +0100 Subject: [PATCH] Add DLQ for reprocessing of failed blocks --- .gitignore | 5 +- client/duneapi/client.go | 75 +++++++++++++ client/duneapi/models.go | 14 +++ cmd/main.go | 44 ++++++-- config/config.go | 11 +- go.mod | 1 + go.sum | 2 + ingester/ingester.go | 69 +++++++----- ingester/mainloop.go | 183 +++++++++++++++++++++++++++++++- ingester/mainloop_test.go | 215 ++++++++++++++++++++++++++++++++++++-- ingester/send.go | 23 ++-- lib/dlq/dlq.go | 94 +++++++++++++++++ mocks/duneapi/client.go | 44 ++++++++ models/gaps.go | 10 ++ 14 files changed, 736 insertions(+), 54 deletions(-) create mode 100644 lib/dlq/dlq.go create mode 100644 models/gaps.go diff --git a/.gitignore b/.gitignore index 9424251..f2eccb6 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,7 @@ go.work go.work.sum # Binary -indexer \ No newline at end of file +indexer +bin + +.idea diff --git a/client/duneapi/client.go b/client/duneapi/client.go index c75aa5c..29acf1e 100644 --- a/client/duneapi/client.go +++ b/client/duneapi/client.go @@ -33,6 +33,8 @@ type BlockchainIngester interface { // PostProgressReport sends a progress report to DuneAPI PostProgressReport(ctx context.Context, progress models.BlockchainIndexProgress) error + GetBlockGaps(ctx context.Context) (*models.BlockchainGaps, error) + // - API to discover the latest block number ingested // this can also provide "next block ranges" to push to DuneAPI // - log/metrics on catching up/falling behind, distance from tip of chain @@ -366,3 +368,76 @@ func (c *client) GetProgressReport(ctx context.Context) (*models.BlockchainIndex } return progress, nil } + +func (c *client) GetBlockGaps(ctx context.Context) (*models.BlockchainGaps, error) { + var response BlockchainGapsResponse + var err error + var responseStatus string + start := time.Now() + + // Log response + defer func() { + if err != nil { + c.log.Error("Getting block gaps failed", + "error", err, + "statusCode", responseStatus, + "duration", time.Since(start), + ) + } else { + c.log.Info("Got block gaps", + "blockGaps", response.String(), + "duration", time.Since(start), + ) + } + }() + + url := fmt.Sprintf("%s/api/beta/blockchain/%s/gaps", c.cfg.URL, c.cfg.BlockchainName) + c.log.Debug("Sending request", "url", url) + req, err := retryablehttp.NewRequestWithContext(ctx, "GET", url, nil) // nil: empty body + if err != nil { + return nil, err + } + req.Header.Set("x-dune-api-key", c.cfg.APIKey) + req = req.WithContext(ctx) + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + responseBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + if resp.StatusCode != http.StatusOK { + bs, _ := io.ReadAll(resp.Body) + responseBody := string(bs) + // We mutate the global err here because we have deferred a log message where we check for non-nil err + err = fmt.Errorf("unexpected status code: %v, %v with body '%s'", resp.StatusCode, resp.Status, responseBody) + return nil, err + } + + err = json.Unmarshal(responseBody, &response) + if err != nil { + return nil, err + } + + gaps := &models.BlockchainGaps{ + Gaps: mapSlice(response.Gaps, func(gap BlockGap) models.BlockGap { + return models.BlockGap{ + FirstMissing: gap.FirstMissing, + LastMissing: gap.LastMissing, + } + }), + } + return gaps, nil +} + +func mapSlice[T any, U any](slice []T, mapper func(T) U) []U { + result := make([]U, len(slice)) + for i, v := range 
slice { + result[i] = mapper(v) + } + return result +} diff --git a/client/duneapi/models.go b/client/duneapi/models.go index d9ad542..0a1e12e 100644 --- a/client/duneapi/models.go +++ b/client/duneapi/models.go @@ -83,3 +83,17 @@ type BlockchainError struct { Error string `json:"error"` Source string `json:"source"` } + +type BlockchainGapsResponse struct { + Gaps []BlockGap `json:"gaps"` +} + +// BlockGap declares an inclusive range of missing block numbers +type BlockGap struct { + FirstMissing int64 `json:"first_missing"` + LastMissing int64 `json:"last_missing"` +} + +func (b *BlockchainGapsResponse) String() string { + return fmt.Sprintf("%+v", *b) +} diff --git a/cmd/main.go b/cmd/main.go index b1c7e86..dd15eae 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -18,6 +18,7 @@ import ( "github.com/duneanalytics/blockchain-ingester/client/jsonrpc" "github.com/duneanalytics/blockchain-ingester/config" "github.com/duneanalytics/blockchain-ingester/ingester" + "github.com/duneanalytics/blockchain-ingester/lib/dlq" "github.com/duneanalytics/blockchain-ingester/models" ) @@ -59,6 +60,19 @@ func main() { } defer duneClient.Close() + // Create an extra Dune API client for DLQ processing since it is not thread-safe yet + duneClientDLQ, err := duneapi.New(logger, duneapi.Config{ + APIKey: cfg.Dune.APIKey, + URL: cfg.Dune.URL, + BlockchainName: cfg.BlockchainName, + Stack: cfg.RPCStack, + DisableCompression: cfg.DisableCompression, + }) + if err != nil { + stdlog.Fatal(err) + } + defer duneClientDLQ.Close() + var wg stdsync.WaitGroup var rpcClient jsonrpc.BlockchainClient @@ -101,21 +115,37 @@ func main() { startBlockNumber = cfg.BlockHeight } + dlqBlockNumbers := dlq.NewDLQWithDelay[int64](dlq.RetryDelayLinear(cfg.DLQRetryInterval)) + + if !cfg.DisableGapsQuery { + blockGaps, err := duneClient.GetBlockGaps(ctx) + if err != nil { + stdlog.Fatal(err) + } else { + ingester.AddBlockGaps(dlqBlockNumbers, blockGaps.Gaps) + } + } + maxCount := int64(0) // 0 means ingest until cancelled ingester := ingester.New( logger, rpcClient, duneClient, + duneClientDLQ, ingester.Config{ - MaxConcurrentRequests: cfg.RPCConcurrency, - ReportProgressInterval: cfg.ReportProgressInterval, - PollInterval: cfg.PollInterval, - Stack: cfg.RPCStack, - BlockchainName: cfg.BlockchainName, - BlockSubmitInterval: cfg.BlockSubmitInterval, - SkipFailedBlocks: cfg.RPCNode.SkipFailedBlocks, + MaxConcurrentRequests: cfg.RPCConcurrency, + MaxConcurrentRequestsDLQ: cfg.DLQConcurrency, + ReportProgressInterval: cfg.ReportProgressInterval, + PollInterval: cfg.PollInterval, + PollDLQInterval: cfg.PollDLQInterval, + Stack: cfg.RPCStack, + BlockchainName: cfg.BlockchainName, + BlockSubmitInterval: cfg.BlockSubmitInterval, + SkipFailedBlocks: cfg.RPCNode.SkipFailedBlocks, + DLQOnly: cfg.DLQOnly, }, progress, + dlqBlockNumbers, ) wg.Add(1) diff --git a/config/config.go b/config/config.go index 8259943..94d2f9d 100644 --- a/config/config.go +++ b/config/config.go @@ -42,15 +42,20 @@ func (r RPCClient) HasError() error { } type Config struct { - BlockHeight int64 `long:"block-height" env:"BLOCK_HEIGHT" description:"block height to start from" default:"-1"` // nolint:lll - BlockchainName string `long:"blockchain-name" env:"BLOCKCHAIN_NAME" description:"name of the blockchain" required:"true"` // nolint:lll - DisableCompression bool `long:"disable-compression" env:"DISABLE_COMPRESSION" description:"disable compression when sending data to Dune"` // nolint:lll + BlockHeight int64 `long:"block-height" env:"BLOCK_HEIGHT" description:"block height to 
start from" default:"-1"` // nolint:lll + BlockchainName string `long:"blockchain-name" env:"BLOCKCHAIN_NAME" description:"name of the blockchain" required:"true"` // nolint:lll + DisableCompression bool `long:"disable-compression" env:"DISABLE_COMPRESSION" description:"disable compression when sending data to Dune"` // nolint:lll + DisableGapsQuery bool `long:"disable-gaps-query" env:"DISABLE_GAPS_QUERY" description:"disable gaps query used to populate the initial DLQ"` // nolint:lll + DLQOnly bool `long:"dlq-only" env:"DLQ_ONLY" description:"Runs just the DLQ processing on its own"` // nolint:lll Dune DuneClient PollInterval time.Duration `long:"rpc-poll-interval" env:"RPC_POLL_INTERVAL" description:"Interval to poll the blockchain node" default:"300ms"` // nolint:lll + PollDLQInterval time.Duration `long:"dlq-poll-interval" env:"DLQ_POLL_INTERVAL" description:"Interval to poll the dlq" default:"300ms"` // nolint:lll + DLQRetryInterval time.Duration `long:"dlq-retry-interval" env:"DLQ_RETRY_INTERVAL" description:"Interval for linear backoff in DLQ " default:"1m"` // nolint:lll ReportProgressInterval time.Duration `long:"report-progress-interval" env:"REPORT_PROGRESS_INTERVAL" description:"Interval to report progress" default:"30s"` // nolint:lll RPCNode RPCClient RPCStack models.EVMStack `long:"rpc-stack" env:"RPC_STACK" description:"Stack for the RPC client" default:"opstack"` // nolint:lll RPCConcurrency int `long:"rpc-concurrency" env:"RPC_CONCURRENCY" description:"Number of concurrent requests to the RPC node" default:"25"` // nolint:lll + DLQConcurrency int `long:"dlq-concurrency" env:"DLQ_CONCURRENCY" description:"Number of concurrent requests to the RPC node for DLQ processing" default:"2"` // nolint:lll BlockSubmitInterval time.Duration `long:"block-submit-interval" env:"BLOCK_SUBMIT_INTERVAL" description:"Interval at which to submit batched blocks to Dune" default:"500ms"` // nolint:lll LogLevel string `long:"log" env:"LOG" description:"Log level" choice:"info" choice:"debug" choice:"warn" choice:"error" default:"info"` // nolint:lll } diff --git a/go.mod b/go.mod index c506fe2..26973a4 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/duneanalytics/blockchain-ingester go 1.22.2 require ( + github.com/emirpasic/gods v1.18.1 github.com/go-errors/errors v1.5.1 github.com/hashicorp/go-retryablehttp v0.7.7 github.com/jessevdk/go-flags v1.5.0 diff --git a/go.sum b/go.sum index 7ee828f..1835ce7 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= +github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/go-errors/errors v1.5.1 h1:ZwEMSLRCapFLflTpT7NKaAc7ukJ8ZPEjzlxt8rPN8bk= diff --git a/ingester/ingester.go b/ingester/ingester.go index 82925fd..16dec59 100644 --- a/ingester/ingester.go +++ b/ingester/ingester.go @@ -5,6 +5,8 @@ import ( "log/slog" "time" + "github.com/duneanalytics/blockchain-ingester/lib/dlq" + "github.com/duneanalytics/blockchain-ingester/client/duneapi" "github.com/duneanalytics/blockchain-ingester/client/jsonrpc" "github.com/duneanalytics/blockchain-ingester/models" @@ -24,48 +26,67 @@ type 
Ingester interface { // It can safely be run concurrently. FetchBlockLoop(context.Context, chan int64, chan models.RPCBlock) error - // SendBlocks pushes to DuneAPI the RPCBlock Payloads as they are received in an endless loop + // SendBlocks consumes RPCBlocks from the channel, reorders them, and sends batches to DuneAPI in an endless loop // it will block until: // - the context is cancelled // - channel is closed // - a fatal error occurs SendBlocks(ctx context.Context, blocksCh <-chan models.RPCBlock, startFrom int64) error - // This is just a placeholder for now - Info() Info + // ProduceBlockNumbersDLQ sends block numbers from the DLQ to outChan. + // It will run continuously until the context is cancelled. + // When the DLQ does not return an eligible next block, it waits for PollDLQInterval before trying again + ProduceBlockNumbersDLQ(ctx context.Context, outChan chan dlq.Item[int64]) error + + // FetchBlockLoopDLQ fetches blocks sent on the channel and sends them on the other channel. + // It will run continuously until the context is cancelled, or the channel is closed. + // It can safely be run concurrently. + FetchBlockLoopDLQ(ctx context.Context, + blockNumbers <-chan dlq.Item[int64], + blocks chan<- dlq.Item[models.RPCBlock], + ) error + + // SendBlocksDLQ pushes one RPCBlock at a time to DuneAPI in the order they are received in + SendBlocksDLQ(ctx context.Context, blocks <-chan dlq.Item[models.RPCBlock]) error Close() error } const ( - defaultMaxBatchSize = 5 defaultReportProgressInterval = 30 * time.Second ) type Config struct { - MaxConcurrentRequests int - PollInterval time.Duration - ReportProgressInterval time.Duration - Stack models.EVMStack - BlockchainName string - BlockSubmitInterval time.Duration - SkipFailedBlocks bool + MaxConcurrentRequests int + MaxConcurrentRequestsDLQ int + PollInterval time.Duration + PollDLQInterval time.Duration + ReportProgressInterval time.Duration + Stack models.EVMStack + BlockchainName string + BlockSubmitInterval time.Duration + SkipFailedBlocks bool + DLQOnly bool } type ingester struct { - log *slog.Logger - node jsonrpc.BlockchainClient - dune duneapi.BlockchainIngester - cfg Config - info Info + log *slog.Logger + node jsonrpc.BlockchainClient + dune duneapi.BlockchainIngester + duneDLQ duneapi.BlockchainIngester + cfg Config + info Info + dlq *dlq.DLQ[int64] } func New( log *slog.Logger, node jsonrpc.BlockchainClient, dune duneapi.BlockchainIngester, + duneDLQ duneapi.BlockchainIngester, cfg Config, progress *models.BlockchainIndexProgress, + dlq *dlq.DLQ[int64], ) Ingester { info := NewInfo(cfg.BlockchainName, cfg.Stack.String()) if progress != nil { @@ -73,18 +94,16 @@ func New( info.IngestedBlockNumber = progress.LastIngestedBlockNumber } ing := &ingester{ - log: log.With("module", "ingester"), - node: node, - dune: dune, - cfg: cfg, - info: info, + log: log.With("module", "ingester"), + node: node, + dune: dune, + duneDLQ: duneDLQ, + cfg: cfg, + info: info, + dlq: dlq, } if ing.cfg.ReportProgressInterval == 0 { ing.cfg.ReportProgressInterval = defaultReportProgressInterval } return ing } - -func (i *ingester) Info() Info { - return i.info -} diff --git a/ingester/mainloop.go b/ingester/mainloop.go index fb73172..b417d38 100644 --- a/ingester/mainloop.go +++ b/ingester/mainloop.go @@ -3,9 +3,14 @@ package ingester import ( "context" "fmt" + "log/slog" + "slices" "sync/atomic" "time" + "github.com/duneanalytics/blockchain-ingester/lib/dlq" + "github.com/emirpasic/gods/utils" + 
"github.com/duneanalytics/blockchain-ingester/models" "github.com/go-errors/errors" "golang.org/x/sync/errgroup" @@ -20,6 +25,18 @@ import ( // The SendBlocks goroutine receives all blocks on an unbuffered channel, // but buffers them in a map until they can be sent in order. func (i *ingester) Run(ctx context.Context, startBlockNumber int64, maxCount int64) error { + // + if i.cfg.DLQOnly { + i.cfg.MaxConcurrentRequests = 0 // if running DLQ Only mode, ignore the MaxConcurrentRequests and set this to 0 + } else { + if i.cfg.MaxConcurrentRequests <= 0 { + return errors.Errorf("MaxConcurrentRequests must be > 0") + } + } + if i.cfg.MaxConcurrentRequestsDLQ <= 0 { + return errors.Errorf("MaxConcurrentRequestsDLQ must be > 0") + } + ctx, cancel := context.WithCancel(ctx) defer cancel() errGroup, ctx := errgroup.WithContext(ctx) @@ -33,9 +50,6 @@ func (i *ingester) Run(ctx context.Context, startBlockNumber int64, maxCount int defer close(blocks) // Start MaxBatchSize goroutines to consume blocks concurrently - if i.cfg.MaxConcurrentRequests <= 0 { - return errors.Errorf("MaxConcurrentRequests must be > 0") - } for range i.cfg.MaxConcurrentRequests { errGroup.Go(func() error { return i.FetchBlockLoop(ctx, blockNumbers, blocks) @@ -48,6 +62,26 @@ func (i *ingester) Run(ctx context.Context, startBlockNumber int64, maxCount int return i.SendBlocks(ctx, blocks, startBlockNumber) }) + // Start DLQ processing + + blockNumbersDLQ := make(chan dlq.Item[int64]) + defer close(blockNumbersDLQ) + + blocksDLQ := make(chan dlq.Item[models.RPCBlock], i.cfg.MaxConcurrentRequestsDLQ+1) + defer close(blocksDLQ) + + errGroup.Go(func() error { + return i.SendBlocksDLQ(ctx, blocksDLQ) + }) + for range i.cfg.MaxConcurrentRequestsDLQ { + errGroup.Go(func() error { + return i.FetchBlockLoopDLQ(ctx, blockNumbersDLQ, blocksDLQ) + }) + } + errGroup.Go(func() error { + return i.ProduceBlockNumbersDLQ(ctx, blockNumbersDLQ) + }) + // Ingest until endBlockNumber, inclusive. 
If maxCount is <= 0, we ingest forever endBlockNumber := startBlockNumber - 1 + maxCount i.log.Info("Starting ingester", @@ -131,14 +165,19 @@ func (i *ingester) FetchBlockLoop( i.log.Error("Failed to get block by number", "blockNumber", blockNumber, - "continueing", i.cfg.SkipFailedBlocks, + "continuing", i.cfg.SkipFailedBlocks, "elapsed", time.Since(startTime), "error", err, ) if !i.cfg.SkipFailedBlocks { return err } - blocks <- models.RPCBlock{BlockNumber: blockNumber, Error: err} + select { + case <-ctx.Done(): + i.log.Debug("FetchBlockLoop: Channel is closed, not sending block to channel", "blockNumber", block.BlockNumber) + return ctx.Err() + case blocks <- models.RPCBlock{BlockNumber: blockNumber, Error: err}: + } continue } @@ -223,6 +262,127 @@ func (i *ingester) ReportProgress(ctx context.Context) error { } } +func (i *ingester) ProduceBlockNumbersDLQ(ctx context.Context, outChan chan dlq.Item[int64]) error { + for { + select { + case <-ctx.Done(): + i.log.Debug("ProduceBlockNumbersDLQ: Context canceled, stopping") + return ctx.Err() + default: + block, ok := i.dlq.GetNextItem() + + if ok { + if i.log.Enabled(ctx, slog.LevelDebug) { + i.log.Debug("ProduceBlockNumbersDLQ: Reprocessing block", "block", block, + "dlqSize", i.dlq.Size()) + } + select { + case outChan <- *block: + // Successfully sent the block to the out channel + case <-ctx.Done(): + i.log.Debug("ProduceBlockNumbersDLQ: Context canceled while sending block, stopping") + return ctx.Err() + } + } else { + if i.log.Enabled(ctx, slog.LevelDebug) { + i.log.Debug("ProduceBlockNumbersDLQ: No eligible blocks in the DLQ so sleeping", + "dlqSize", i.dlq.Size()) + } + select { + case <-time.After(i.cfg.PollDLQInterval): // Polling interval when DLQ is empty + case <-ctx.Done(): + i.log.Debug("ProduceBlockNumbersDLQ: Context canceled while sleeping, stopping") + return ctx.Err() + } + } + } + } +} + +func (i *ingester) FetchBlockLoopDLQ(ctx context.Context, blockNumbers <-chan dlq.Item[int64], + blocks chan<- dlq.Item[models.RPCBlock], +) error { + for { + select { + case <-ctx.Done(): + i.log.Info("FetchBlockLoopDLQ: context is done") + return ctx.Err() + case blockNumber := <-blockNumbers: + startTime := time.Now() + block, err := i.node.BlockByNumber(ctx, blockNumber.Value) + if err != nil { + if errors.Is(err, context.Canceled) { + i.log.Error("FetchBlockLoopDLQ: Context canceled, stopping") + return ctx.Err() + } + i.log.Error("FetchBlockLoopDLQ: Failed to get block by number", + "blockNumber", blockNumber, + "elapsed", time.Since(startTime), + "error", err, + ) + select { + case <-ctx.Done(): + i.log.Debug("FetchBlockLoop: Channel is closed, not sending block to channel", "blockNumber", block.BlockNumber) + return ctx.Err() + case blocks <- dlq.MapItem(blockNumber, func(blockNumber int64) models.RPCBlock { + return models.RPCBlock{BlockNumber: blockNumber, Error: err} + }): + } + continue + } + getBlockElapsed := time.Since(startTime) + select { + case <-ctx.Done(): + i.log.Debug("FetchBlockLoopDLQ: Channel is closed, not sending block to channel", "blockNumber", block.BlockNumber) + return ctx.Err() + case blocks <- dlq.MapItem(blockNumber, func(_ int64) models.RPCBlock { + return block + }): + i.log.Debug( + "FetchBlockLoopDLQ: Got and sent block", + "blockNumber", blockNumber, + "getBlockElapsed", getBlockElapsed, + ) + } + } + } +} + +func (i *ingester) SendBlocksDLQ(ctx context.Context, blocks <-chan dlq.Item[models.RPCBlock]) error { + i.log.Debug("SendBlocksDLQ: Starting to receive blocks") + for { + select { + 
case <-ctx.Done(): + i.log.Debug("SendBlocksDLQ: Context canceled, stopping") + return ctx.Err() + case block, ok := <-blocks: + if !ok { + i.log.Debug("SendBlocksDLQ: Channel is closed, returning") + return nil + } + if block.Value.Errored() { + i.dlq.AddItem(block.Value.BlockNumber, block.Retries) + i.log.Error("Received FAILED block", "number", block.Value.BlockNumber) + // TODO: report error once ErrorState struct is made thread-safe + } else { + i.log.Debug( + "SendBlocksDLQ: Received block", + "blockNumber", block.Value.BlockNumber, + ) + if err := i.duneDLQ.SendBlocks(ctx, []models.RPCBlock{block.Value}); err != nil { + if errors.Is(err, context.Canceled) { + i.log.Info("SendBlocksDLQ: Context canceled, stopping") + return ctx.Err() + } + i.log.Error("SendBlocksDLQ: Failed to send block, requeueing...", "block", block.Value.BlockNumber, "error", err) + i.dlq.AddItem(block.Value.BlockNumber, block.Retries) + // TODO: report error once ErrorState struct is made thread-safe + } + } + } + } +} + func (i *ingester) Close() error { // Send a final progress report to flush progress ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) @@ -237,3 +397,16 @@ func (i *ingester) Close() error { return i.node.Close() } + +func AddBlockGaps(dlq *dlq.DLQ[int64], gaps []models.BlockGap) { + // queue these in reverse so that recent blocks are retried first + slices.SortFunc(gaps, func(a, b models.BlockGap) int { + return -utils.Int64Comparator(a.FirstMissing, b.FirstMissing) + }) + + for _, gap := range gaps { + for i := gap.FirstMissing; i <= gap.LastMissing; i++ { + dlq.AddItem(i, 0) + } + } +} diff --git a/ingester/mainloop_test.go b/ingester/mainloop_test.go index d65e6a4..8209f1e 100644 --- a/ingester/mainloop_test.go +++ b/ingester/mainloop_test.go @@ -10,6 +10,8 @@ import ( "testing" "time" + "github.com/duneanalytics/blockchain-ingester/lib/dlq" + "github.com/duneanalytics/blockchain-ingester/ingester" duneapi_mock "github.com/duneanalytics/blockchain-ingester/mocks/duneapi" jsonrpc_mock "github.com/duneanalytics/blockchain-ingester/mocks/jsonrpc" @@ -73,12 +75,15 @@ func TestRunUntilCancel(t *testing.T) { slog.New(slog.NewTextHandler(logOutput, nil)), rpcClient, duneapi, + duneapi, ingester.Config{ - BlockSubmitInterval: time.Nanosecond, - MaxConcurrentRequests: 10, - SkipFailedBlocks: false, + BlockSubmitInterval: time.Nanosecond, + MaxConcurrentRequests: 10, + MaxConcurrentRequestsDLQ: 2, + SkipFailedBlocks: false, }, nil, // progress + dlq.NewDLQ[int64](), ) err := ing.Run(ctx, 1, -1) // run until canceled @@ -110,10 +115,12 @@ func TestProduceBlockNumbers(t *testing.T) { slog.New(slog.NewTextHandler(logOutput, nil)), rpcClient, duneapi, + duneapi, ingester.Config{ BlockSubmitInterval: time.Nanosecond, }, nil, // progress + dlq.NewDLQ[int64](), ) blockNumbers := make(chan int64) var wg sync.WaitGroup @@ -155,10 +162,12 @@ func TestSendBlocks(t *testing.T) { slog.New(slog.NewTextHandler(logOutput, nil)), nil, // node client isn't used in this unit test duneapi, + duneapi, ingester.Config{ BlockSubmitInterval: time.Nanosecond, }, nil, // progress + dlq.NewDLQ[int64](), ) blocks := make(chan models.RPCBlock) @@ -245,14 +254,17 @@ func TestRunBlocksOutOfOrder(t *testing.T) { slog.New(slog.NewTextHandler(logOutput, nil)), rpcClient, duneapi, + duneapi, ingester.Config{ - MaxConcurrentRequests: 20, // fetch blocks in multiple goroutines + MaxConcurrentRequests: 20, + MaxConcurrentRequestsDLQ: 2, // fetch blocks in multiple goroutines // big enough compared to the time spent 
in block by number to ensure batching. We panic // in the mocked Dune client if we don't get a batch of blocks (more than one block). BlockSubmitInterval: 50 * time.Millisecond, SkipFailedBlocks: false, }, nil, // progress + dlq.NewDLQ[int64](), ) err := ing.Run(ctx, 1, -1) // run until canceled @@ -294,12 +306,15 @@ func TestRunRPCNodeFails(t *testing.T) { slog.New(slog.NewTextHandler(logOutput, nil)), rpcClient, duneapi, + duneapi, ingester.Config{ - MaxConcurrentRequests: 10, - BlockSubmitInterval: time.Millisecond, - SkipFailedBlocks: false, + MaxConcurrentRequests: 10, + MaxConcurrentRequestsDLQ: 2, + BlockSubmitInterval: time.Millisecond, + SkipFailedBlocks: false, }, nil, // progress + dlq.NewDLQ[int64](), ) err := ing.Run(ctx, 1, -1) // run until canceled @@ -313,12 +328,198 @@ func TestRunFailsIfNoConcurrentRequests(t *testing.T) { slog.New(slog.NewTextHandler(logOutput, nil)), nil, nil, + nil, ingester.Config{ MaxConcurrentRequests: 0, }, nil, // progress + dlq.NewDLQ[int64](), ) err := ing.Run(context.Background(), 1, -1) // run until canceled require.ErrorContains(t, err, "MaxConcurrentRequests must be > 0") } + +func TestRunFailsIfNoConcurrentRequestsDLQ(t *testing.T) { + logOutput := io.Discard + ing := ingester.New( + slog.New(slog.NewTextHandler(logOutput, nil)), + nil, + nil, + nil, + ingester.Config{ + MaxConcurrentRequests: 10, + MaxConcurrentRequestsDLQ: 0, + }, + nil, // progress + dlq.NewDLQ[int64](), + ) + + err := ing.Run(context.Background(), 1, -1) // run until canceled + require.ErrorContains(t, err, "MaxConcurrentRequestsDLQ must be > 0") +} + +func TestRunWithDLQ(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + maxBlockNumber := int64(1000) + startBlockNumber := int64(10) + + // Initial DLQ + dlqBlockNumbers := dlq.NewDLQWithDelay[int64](dlq.RetryDelayLinear(time.Duration(10) * time.Millisecond)) + gaps := []models.BlockGap{ + { + FirstMissing: 9, + LastMissing: 9, + }, { + FirstMissing: 3, + LastMissing: 7, + }, { + FirstMissing: 0, + LastMissing: 0, + }, + } + ingester.AddBlockGaps(dlqBlockNumbers, gaps) + + // blockNumber int64 -> timesSubmitted int + var blocksIndexed sync.Map + // Prepopulate expected blocks + for i := int64(0); i < maxBlockNumber; i++ { + blocksIndexed.Store(i, 0) + } + // Add those that aren't considered as previous gaps + incrementAndGet(&blocksIndexed, int64(1)) + incrementAndGet(&blocksIndexed, int64(2)) + incrementAndGet(&blocksIndexed, int64(8)) + + // Dune API Mocking + var sendBlocksRequests sync.Map + duneapi := &duneapi_mock.BlockchainIngesterMock{ + SendBlocksFunc: func(_ context.Context, blocks []models.RPCBlock) error { + if len(blocks) == 0 { + return nil + } + // Count Requests by block number + for _, block := range blocks { + incrementAndGet(&sendBlocksRequests, block.BlockNumber) + } + + // Fail if this batch contains a block number that hasn't been requested at least twice before this call + for _, block := range blocks { + requests, _ := sendBlocksRequests.Load(block.BlockNumber) + if requests.(int) <= 2 { + return errors.Errorf("failing batch due to %v having only been requested %v times", + block.BlockNumber, requests) + } + } + + // Count blocks as indexed by block number + for _, block := range blocks { + incrementAndGet(&blocksIndexed, block.BlockNumber) + } + + // Look for gaps + if !duneStoreContainsGaps(&blocksIndexed, maxBlockNumber) { + // cancel execution when we have sent all blocks + cancel() + return context.Canceled + } + + return nil + }, + PostProgressReportFunc: func(_ 
context.Context, _ models.BlockchainIndexProgress) error { + return nil + }, + } + + // RPC Mocking + var rpcBlocksRequests sync.Map + rpcClient := &jsonrpc_mock.BlockchainClientMock{ + LatestBlockNumberFunc: func() (int64, error) { + return maxBlockNumber + 1, nil + }, + BlockByNumberFunc: func(_ context.Context, blockNumber int64) (models.RPCBlock, error) { + incrementAndGet(&rpcBlocksRequests, blockNumber) + + // Fail every 10th block numbers the first 2 times + if blockNumber%10 == 0 { + requests, _ := rpcBlocksRequests.Load(blockNumber) + if requests.(int) <= 2 { + return models.RPCBlock{}, + errors.Errorf("failing rpc request due to %v having only been requested %v times", + blockNumber, requests) + } + } + + return models.RPCBlock{ + BlockNumber: blockNumber, + Payload: []byte(`block`), + }, nil + }, + CloseFunc: func() error { + return nil + }, + } + // Swap these to see logs + // logOutput := os.Stderr + logOutput := io.Discard + ing := ingester.New( + slog.New(slog.NewTextHandler(logOutput, nil)), + rpcClient, + duneapi, + duneapi, + ingester.Config{ + BlockSubmitInterval: time.Nanosecond, + MaxConcurrentRequests: 10, + MaxConcurrentRequestsDLQ: 1, + DLQOnly: false, + SkipFailedBlocks: true, + }, + nil, // progress + dlqBlockNumbers, + ) + + err := ing.Run(ctx, startBlockNumber, -1) // run until canceled + require.False(t, duneStoreContainsGaps(&blocksIndexed, maxBlockNumber)) + require.GreaterOrEqual(t, lenSyncMap(&blocksIndexed), int(maxBlockNumber)) + require.ErrorIs(t, err, context.Canceled) // this is expected +} + +func duneStoreContainsGaps(blocksIndexed *sync.Map, maxBlockNumber int64) bool { + containsGap := false + blocksIndexed.Range(func(key, value any) bool { + blockNumber := key.(int64) + count := value.(int) + if blockNumber <= maxBlockNumber && count < 1 { + containsGap = true + return false + } + return true + }) + return containsGap +} + +func incrementAndGet(m *sync.Map, key interface{}) int { + for { + // Load the current value associated with the key else initialise + currentValue, _ := m.LoadOrStore(key, 0) + + // Increment the current value. + newValue := currentValue.(int) + 1 + + // Attempt to store the new value back into the sync.Map. Compare-and-swap (CAS) approach ensures atomicity. + if m.CompareAndSwap(key, currentValue, newValue) { + // If the swap succeeded, return the new value. + return newValue + } + // If the swap failed, it means the value was updated by another goroutine. Retry the operation. 
+ } +} + +func lenSyncMap(m *sync.Map) int { + length := 0 + m.Range(func(_, _ interface{}) bool { + length++ + return true + }) + return length +} diff --git a/ingester/send.go b/ingester/send.go index 8b69e01..50a90ac 100644 --- a/ingester/send.go +++ b/ingester/send.go @@ -91,6 +91,9 @@ func (i *ingester) trySendBlockBatch( for block, ok := collectedBlocks[nextBlockToSend]; ok; block, ok = collectedBlocks[nextBlockToSend] { // Skip Failed block if we're configured to skip Failed blocks if i.cfg.SkipFailedBlocks && block.Errored() { + i.log.Error("SendBlocks: RPCBlock has an error, requeueing...", "block", block.BlockNumber, "error", block.Error) + i.dlq.AddItemHighPriority(block.BlockNumber) + delete(collectedBlocks, nextBlockToSend) nextBlockToSend++ continue } @@ -110,7 +113,7 @@ func (i *ingester) trySendBlockBatch( // Send the batch lastBlockNumber := blockBatch[len(blockBatch)-1].BlockNumber - if lastBlockNumber != nextBlockToSend-1 { + if !i.cfg.SkipFailedBlocks && lastBlockNumber != nextBlockToSend-1 { panic("unexpected last block number") } if err := i.dune.SendBlocks(ctx, blockBatch); err != nil { @@ -119,17 +122,25 @@ func (i *ingester) trySendBlockBatch( return nextBlockToSend, nil } - // Store error for reporting - blocknumbers := make([]string, len(blockBatch)) - for i, block := range blockBatch { - blocknumbers[i] = fmt.Sprintf("%d", block.BlockNumber) + i.log.Error("SendBlocks: Failed to send batch, requeueing...", + "firstBlockInBatch", blockBatch[0].BlockNumber, + "lastBlockInBatch", lastBlockNumber, "error", err) + blockNumbers := make([]string, len(blockBatch)) + for n, block := range blockBatch { + i.dlq.AddItemHighPriority(block.BlockNumber) + blockNumbers[n] = fmt.Sprintf("%d", block.BlockNumber) } i.info.Errors.ObserveDuneError(ErrorInfo{ Error: err, - BlockNumbers: strings.Join(blocknumbers, ","), + BlockNumbers: strings.Join(blockNumbers, ","), }) + if i.cfg.SkipFailedBlocks { + i.log.Error("SendBlocks: Failed to send batch, continuing", "error", err) + return nextBlockToSend, nil + } + err := errors.Errorf("failed to send batch: %w", err) i.log.Error("SendBlocks: Failed to send batch, exiting", "error", err) return nextBlockToSend, err diff --git a/lib/dlq/dlq.go b/lib/dlq/dlq.go new file mode 100644 index 0000000..34ab7d9 --- /dev/null +++ b/lib/dlq/dlq.go @@ -0,0 +1,94 @@ +package dlq + +import ( + "sync" + "time" + + pq "github.com/emirpasic/gods/queues/priorityqueue" + "github.com/emirpasic/gods/utils" +) + +type DLQ[T any] struct { + priorityQueue pq.Queue // structure not thread safe + mutex sync.Mutex + retryDelay RetryStrategy +} + +// RetryStrategy takes the number of retries and returns a Duration. +// This allows the caller to implement any strategy they like, be it constant, linear, exponential, etc. 
+type RetryStrategy func(retries int) time.Duration + +// Item This is generic so that metadata about retries can be maintained in an envelope during processing for when +// an item needs to make its way back onto the DLQ later +type Item[T any] struct { + Value T + Retries int + nextRunTime time.Time +} + +func MapItem[T, U any](b Item[T], mapper func(T) U) Item[U] { + return Item[U]{ + Value: mapper(b.Value), + Retries: b.Retries, + nextRunTime: b.nextRunTime, + } +} + +func NewDLQ[T any]() *DLQ[T] { + return NewDLQWithDelay[T](RetryDelayLinear(time.Minute)) +} + +func RetryDelayLinear(backoff time.Duration) RetryStrategy { + return func(retries int) time.Duration { + return time.Duration(retries) * backoff // retries must be converted to a Duration for multiplication + } +} + +func NewDLQWithDelay[T any](retryDelay func(retries int) time.Duration) *DLQ[T] { + return &DLQ[T]{priorityQueue: *pq.NewWith(byNextRunTime[T]), retryDelay: retryDelay} +} + +// Comparator function (sort by nextRunTime in ascending order) +func byNextRunTime[T any](a, b interface{}) int { + return utils.TimeComparator(a.(Item[T]).nextRunTime, b.(Item[T]).nextRunTime) +} + +func (dlq *DLQ[T]) AddItem(item T, retries int) { + nextRunTime := time.Now().Add(dlq.retryDelay(retries)) + + dlq.mutex.Lock() + defer dlq.mutex.Unlock() + + dlq.priorityQueue.Enqueue(Item[T]{Value: item, Retries: retries + 1, nextRunTime: nextRunTime}) +} + +func (dlq *DLQ[T]) AddItemHighPriority(item T) { + dlq.mutex.Lock() + defer dlq.mutex.Unlock() + + dlq.priorityQueue.Enqueue(Item[T]{Value: item, Retries: 0, nextRunTime: time.Time{}}) +} + +func (dlq *DLQ[T]) GetNextItem() (value *Item[T], ok bool) { + dlq.mutex.Lock() + defer dlq.mutex.Unlock() + + peek, ok := dlq.priorityQueue.Peek() + if !ok || peek.(Item[T]).nextRunTime.After(time.Now()) { + return nil, false + } + + item, ok := dlq.priorityQueue.Dequeue() + if ok { + itemCasted := item.(Item[T]) + return &itemCasted, ok + } + return nil, ok +} + +func (dlq *DLQ[T]) Size() int { + dlq.mutex.Lock() + defer dlq.mutex.Unlock() + + return dlq.priorityQueue.Size() +} diff --git a/mocks/duneapi/client.go b/mocks/duneapi/client.go index 64d979a..973a3f7 100644 --- a/mocks/duneapi/client.go +++ b/mocks/duneapi/client.go @@ -20,6 +20,9 @@ var _ duneapi.BlockchainIngester = &BlockchainIngesterMock{} // // // make and configure a mocked duneapi.BlockchainIngester // mockedBlockchainIngester := &BlockchainIngesterMock{ +// GetBlockGapsFunc: func(ctx context.Context) (*models.BlockchainGaps, error) { +// panic("mock out the GetBlockGaps method") +// }, // GetProgressReportFunc: func(ctx context.Context) (*models.BlockchainIndexProgress, error) { // panic("mock out the GetProgressReport method") // }, @@ -36,6 +39,9 @@ var _ duneapi.BlockchainIngester = &BlockchainIngesterMock{} // // } type BlockchainIngesterMock struct { + // GetBlockGapsFunc mocks the GetBlockGaps method. + GetBlockGapsFunc func(ctx context.Context) (*models.BlockchainGaps, error) + // GetProgressReportFunc mocks the GetProgressReport method. GetProgressReportFunc func(ctx context.Context) (*models.BlockchainIndexProgress, error) @@ -47,6 +53,11 @@ type BlockchainIngesterMock struct { // calls tracks calls to the methods. calls struct { + // GetBlockGaps holds details about calls to the GetBlockGaps method. + GetBlockGaps []struct { + // Ctx is the ctx argument value. + Ctx context.Context + } // GetProgressReport holds details about calls to the GetProgressReport method. 
GetProgressReport []struct { // Ctx is the ctx argument value. @@ -67,11 +78,44 @@ type BlockchainIngesterMock struct { Payloads []models.RPCBlock } } + lockGetBlockGaps sync.RWMutex lockGetProgressReport sync.RWMutex lockPostProgressReport sync.RWMutex lockSendBlocks sync.RWMutex } +// GetBlockGaps calls GetBlockGapsFunc. +func (mock *BlockchainIngesterMock) GetBlockGaps(ctx context.Context) (*models.BlockchainGaps, error) { + if mock.GetBlockGapsFunc == nil { + panic("BlockchainIngesterMock.GetBlockGapsFunc: method is nil but BlockchainIngester.GetBlockGaps was just called") + } + callInfo := struct { + Ctx context.Context + }{ + Ctx: ctx, + } + mock.lockGetBlockGaps.Lock() + mock.calls.GetBlockGaps = append(mock.calls.GetBlockGaps, callInfo) + mock.lockGetBlockGaps.Unlock() + return mock.GetBlockGapsFunc(ctx) +} + +// GetBlockGapsCalls gets all the calls that were made to GetBlockGaps. +// Check the length with: +// +// len(mockedBlockchainIngester.GetBlockGapsCalls()) +func (mock *BlockchainIngesterMock) GetBlockGapsCalls() []struct { + Ctx context.Context +} { + var calls []struct { + Ctx context.Context + } + mock.lockGetBlockGaps.RLock() + calls = mock.calls.GetBlockGaps + mock.lockGetBlockGaps.RUnlock() + return calls +} + // GetProgressReport calls GetProgressReportFunc. func (mock *BlockchainIngesterMock) GetProgressReport(ctx context.Context) (*models.BlockchainIndexProgress, error) { if mock.GetProgressReportFunc == nil { diff --git a/models/gaps.go b/models/gaps.go new file mode 100644 index 0000000..cc17645 --- /dev/null +++ b/models/gaps.go @@ -0,0 +1,10 @@ +package models + +type BlockchainGaps struct { + Gaps []BlockGap +} + +type BlockGap struct { + FirstMissing int64 + LastMissing int64 +}
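
The lib/dlq package introduced above is a small generic, mutex-guarded priority queue and is usable on its own. A minimal standalone sketch, assuming only the exported API added in this patch (NewDLQWithDelay, RetryDelayLinear, AddItem, AddItemHighPriority, GetNextItem, Size); the module path matches go.mod, while the timings and block numbers are purely illustrative:

    package main

    import (
        "fmt"
        "time"

        "github.com/duneanalytics/blockchain-ingester/lib/dlq"
    )

    func main() {
        // Linear backoff: an item re-added after n retries becomes eligible n*100ms later.
        q := dlq.NewDLQWithDelay[int64](dlq.RetryDelayLinear(100 * time.Millisecond))

        // Failed block numbers are enqueued with their current retry count...
        q.AddItem(42, 0)
        // ...or pushed ahead of delayed retries (what SendBlocks does when a batch send fails).
        q.AddItemHighPriority(7)

        for q.Size() > 0 {
            item, ok := q.GetNextItem()
            if !ok {
                // Nothing eligible yet; the ingester sleeps PollDLQInterval at this point.
                time.Sleep(50 * time.Millisecond)
                continue
            }
            fmt.Printf("reprocessing block %d (retry counter: %d)\n", item.Value, item.Retries)
            // On another failure the item would go back with its retry count preserved:
            // q.AddItem(item.Value, item.Retries)
        }
    }

Items are ordered by nextRunTime, so AddItemHighPriority entries (zero time) drain before any delayed retry; SendBlocks relies on this to requeue fresh batch failures ahead of the older gap-derived entries seeded from GetBlockGaps.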