Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lazily parse index per-chromosome #108

Merged
merged 2 commits into from
Feb 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 75 additions & 19 deletions src/bai.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import QuickLRU from 'quick-lru'

import Chunk from './chunk'
import IndexFile from './indexFile'
import { BaseOpts, findFirstData, optimizeChunks, parsePseudoBin } from './util'
import VirtualOffset, { fromBytes } from './virtualOffset'
import { VirtualOffset, fromBytes } from './virtualOffset'

const BAI_MAGIC = 21578050 // BAI\1

Expand All @@ -12,6 +14,12 @@ function roundUp(n: number, multiple: number) {
return n - (n % multiple) + multiple
}

/**
 * Element type of the array that `indexCov` resolves to
 * (`Promise<IndexCovEntry[]>`); it names the previously inline
 * `{ start: number; end: number; score: number }` shape.
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * only; confirm against the body of `indexCov`.
 */
export interface IndexCovEntry {
// presumably the start coordinate of the interval this entry describes
start: number
// presumably the end coordinate of the interval this entry describes
end: number
// presumably the coverage estimate for the interval; TODO confirm units
score: number
}

function reg2bins(beg: number, end: number) {
end -= 1
return [
Expand All @@ -29,10 +37,9 @@ export default class BAI extends IndexFile {

async lineCount(refId: number, opts?: BaseOpts) {
const indexData = await this.parse(opts)
return indexData.indices[refId]?.stats?.lineCount || 0
return indexData.indices(refId)?.stats?.lineCount || 0
}

// fetch and parse the index
async _parse(_opts?: BaseOpts) {
const bytes = await this.filehandle.readFile()
const dataView = new DataView(bytes.buffer)
Expand All @@ -50,17 +57,53 @@ export default class BAI extends IndexFile {
let curr = 8
let firstDataLine: VirtualOffset | undefined

type BinIndex = Record<string, Chunk[]>
type LinearIndex = VirtualOffset[]
const indices = new Array<{
binIndex: BinIndex
linearIndex: LinearIndex
stats?: { lineCount: number }
}>(refCount)

const offsets = [] as number[]
for (let i = 0; i < refCount; i++) {
// the binning index
offsets.push(curr)
const binCount = dataView.getInt32(curr, true)

curr += 4

for (let j = 0; j < binCount; j += 1) {
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin === binLimit + 1) {
curr += 4
curr += 32
} else if (bin > binLimit + 1) {
throw new Error('bai index contains too many bins, please use CSI')
} else {
const chunkCount = dataView.getInt32(curr, true)
curr += 4
for (let k = 0; k < chunkCount; k++) {
curr += 8
curr += 8
}
}
}

const linearCount = dataView.getInt32(curr, true)
curr += 4
// as we're going through the linear index, figure out the smallest
// virtual offset in the indexes, which tells us where the BAM header
// ends
const linearIndex = new Array<VirtualOffset>(linearCount)
for (let j = 0; j < linearCount; j++) {
const offset = fromBytes(bytes, curr)
curr += 8
firstDataLine = findFirstData(firstDataLine, offset)
linearIndex[j] = offset
}
}
const indicesCache = new QuickLRU<number, ReturnType<typeof getIndices>>({
maxSize: 5,
})

function getIndices(refId: number) {
let curr = offsets[refId]
if (curr === undefined) {
return undefined
}
const binCount = dataView.getInt32(curr, true)
let stats

Expand Down Expand Up @@ -105,14 +148,27 @@ export default class BAI extends IndexFile {
linearIndex[j] = offset
}

indices[i] = { binIndex, linearIndex, stats }
return {
binIndex,
linearIndex,
stats,
}
}

return {
bai: true,
firstDataLine,
maxBlockSize: 1 << 16,
indices,
indices: (refId: number) => {
if (!indicesCache.has(refId)) {
const result = getIndices(refId)
if (result) {
indicesCache.set(refId, result)
}
return result
}
return indicesCache.get(refId)
},
refCount,
}
}
Expand All @@ -121,12 +177,12 @@ export default class BAI extends IndexFile {
seqId: number,
start?: number,
end?: number,
opts: BaseOpts = {},
): Promise<{ start: number; end: number; score: number }[]> {
opts?: BaseOpts,
): Promise<IndexCovEntry[]> {
const v = 16384
const range = start !== undefined
const indexData = await this.parse(opts)
const seqIdx = indexData.indices[seqId]
const seqIdx = indexData.indices(seqId)

if (!seqIdx) {
return []
Expand Down Expand Up @@ -174,7 +230,7 @@ export default class BAI extends IndexFile {
if (!indexData) {
return []
}
const ba = indexData.indices[refId]
const ba = indexData.indices(refId)

if (!ba) {
return []
Expand Down Expand Up @@ -225,6 +281,6 @@ export default class BAI extends IndexFile {

async hasRefSeq(seqId: number, opts: BaseOpts = {}) {
const header = await this.parse(opts)
return !!header.indices[seqId]?.binIndex
return !!header.indices(seqId)?.binIndex
}
}
1 change: 0 additions & 1 deletion src/bamFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import crc32 from 'crc/calculators/crc32'
import { LocalFile, RemoteFile } from 'generic-filehandle2'
import QuickLRU from 'quick-lru'

// locals
import BAI from './bai'
import Chunk from './chunk'
import CSI from './csi'
Expand Down
6 changes: 3 additions & 3 deletions src/chunk.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import VirtualOffset from './virtualOffset'
import { Offset } from './virtualOffset'

// little class representing a chunk in the index
export default class Chunk {
public buffer?: Uint8Array

constructor(
public minv: VirtualOffset,
public maxv: VirtualOffset,
public minv: Offset,
public maxv: Offset,
public bin: number,
public _fetchedSize?: number,
) {}
Expand Down
64 changes: 51 additions & 13 deletions src/csi.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { unzip } from '@gmod/bgzf-filehandle'
import QuickLRU from 'quick-lru'

import Chunk from './chunk'
import IndexFile from './indexFile'
Expand All @@ -9,7 +10,7 @@ import {
parseNameBytes,
parsePseudoBin,
} from './util'
import VirtualOffset, { fromBytes } from './virtualOffset'
import { VirtualOffset, fromBytes } from './virtualOffset'

const CSI1_MAGIC = 21582659 // CSI\1
const CSI2_MAGIC = 38359875 // CSI\2
Expand All @@ -30,7 +31,7 @@ export default class CSI extends IndexFile {

async lineCount(refId: number, opts?: BaseOpts) {
const indexData = await this.parse(opts)
return indexData.indices[refId]?.stats?.lineCount || 0
return indexData.indices(refId)?.stats?.lineCount || 0
}

async indexCov() {
Expand Down Expand Up @@ -94,20 +95,45 @@ export default class CSI extends IndexFile {
this.minShift = dataView.getInt32(4, true)
this.depth = dataView.getInt32(8, true)
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
const maxBinNumber = this.maxBinNumber
const auxLength = dataView.getInt32(12, true)
const aux = auxLength >= 30 ? this.parseAuxData(bytes, 16) : undefined
const refCount = dataView.getInt32(16 + auxLength, true)

type BinIndex = Record<string, Chunk[]>

// read the indexes for each reference sequence
let curr = 16 + auxLength + 4
let firstDataLine: VirtualOffset | undefined
const indices = new Array<{
binIndex: BinIndex
stats?: { lineCount: number }
}>(refCount)
const offsets = [] as number[]
for (let i = 0; i < refCount; i++) {
offsets.push(curr)
const binCount = dataView.getInt32(curr, true)
curr += 4
for (let j = 0; j < binCount; j++) {
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin > this.maxBinNumber) {
curr += 28 + 16
} else {
curr += 8
const chunkCount = dataView.getInt32(curr, true)
curr += 4
for (let k = 0; k < chunkCount; k += 1) {
curr += 8
curr += 8
}
}
}
}

const indicesCache = new QuickLRU<number, ReturnType<typeof getIndices>>({
maxSize: 5,
})

function getIndices(refId: number) {
let curr = offsets[refId]
if (curr === undefined) {
return undefined
}
// the binning index
const binCount = dataView.getInt32(curr, true)
curr += 4
Expand All @@ -116,7 +142,7 @@ export default class CSI extends IndexFile {
for (let j = 0; j < binCount; j++) {
const bin = dataView.getUint32(curr, true)
curr += 4
if (bin > this.maxBinNumber) {
if (bin > maxBinNumber) {
stats = parsePseudoBin(bytes, curr + 28)
curr += 28 + 16
} else {
Expand All @@ -137,13 +163,25 @@ export default class CSI extends IndexFile {
}
}

indices[i] = { binIndex, stats }
return {
binIndex,
stats,
}
}

return {
csiVersion,
firstDataLine,
indices,
indices: (refId: number) => {
if (!indicesCache.has(refId)) {
const result = getIndices(refId)
if (result) {
indicesCache.set(refId, result)
}
return result
}
return indicesCache.get(refId)
},
refCount,
csi: true,
maxBlockSize: 1 << 16,
Expand All @@ -162,7 +200,7 @@ export default class CSI extends IndexFile {
}

const indexData = await this.parse(opts)
const ba = indexData.indices[refId]
const ba = indexData.indices(refId)

if (!ba) {
return []
Expand Down Expand Up @@ -231,6 +269,6 @@ export default class CSI extends IndexFile {

async hasRefSeq(seqId: number, opts: BaseOpts = {}) {
const header = await this.parse(opts)
return !!header.indices[seqId]?.binIndex
return !!header.indices(seqId)?.binIndex
}
}
3 changes: 1 addition & 2 deletions src/indexFile.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

import Chunk from './chunk'
import { BaseOpts } from './util'

Expand All @@ -10,7 +9,7 @@ export default abstract class IndexFile {

/**
* @param {filehandle} filehandle
* @param {function} [renameRefSeqs]
* @param {function} renameRefSeqs
*/
constructor({
filehandle,
Expand Down
1 change: 0 additions & 1 deletion src/nullFilehandle.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

export default class NullFilehandle {
public read(): Promise<any> {
throw new Error('never called')
Expand Down
7 changes: 2 additions & 5 deletions src/util.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import Chunk from './chunk'
import { longFromBytesToUnsigned } from './long'
import VirtualOffset from './virtualOffset'
import { Offset, VirtualOffset } from './virtualOffset'

export function timeout(ms: number) {
return new Promise(resolve => setTimeout(resolve, ms))
Expand Down Expand Up @@ -69,7 +69,7 @@ export function makeOpts(obj: AbortSignal | BaseOpts = {}): BaseOpts {
return 'aborted' in obj ? ({ signal: obj } as BaseOpts) : obj
}

export function optimizeChunks(chunks: Chunk[], lowest?: VirtualOffset) {
export function optimizeChunks(chunks: Chunk[], lowest?: Offset) {
const mergedChunks: Chunk[] = []
let lastChunk: Chunk | undefined

Expand Down Expand Up @@ -163,13 +163,10 @@ export function concatUint8Array(args: Uint8Array[]) {
return mergedArray
}


/**
 * Drains an async iterable that yields arrays and flattens every yielded
 * chunk, in order, into a single array.
 *
 * @param gen - async iterable whose items are arrays of `T`
 * @returns a promise for one array holding all elements of all chunks, in
 * the order they were yielded
 */
export async function gen2array<T>(gen: AsyncIterable<T[]>): Promise<T[]> {
  const out: T[] = []
  for await (const chunk of gen) {
    // Append in place instead of `out = out.concat(chunk)`: concat copies the
    // whole accumulator on every iteration, which is quadratic in the total
    // element count. An explicit loop (rather than `push(...chunk)`) avoids
    // hitting the engine's argument-count limit on very large chunks.
    for (const item of chunk) {
      out.push(item)
    }
  }
  return out
}


9 changes: 8 additions & 1 deletion src/virtualOffset.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
export default class VirtualOffset {
/**
 * Structural description of an offset value: two numeric positions plus
 * string conversion and comparison. It is used as the type of `Chunk`'s
 * `minv`/`maxv` and of the `lowest` argument of `optimizeChunks`, and the
 * `VirtualOffset` class declares the same two public position fields.
 */
export interface Offset {
// NOTE(review): presumably the position of the compressed block within the file; confirm
blockPosition: number
// NOTE(review): presumably the position within that block's uncompressed data; confirm
dataPosition: number
// string form of the offset; the exact format is not visible here
toString(): string
// compares this offset with `arg`; presumably negative/zero/positive ordering, TODO confirm
compareTo(arg: Offset): number
}

export class VirtualOffset {
public blockPosition: number
public dataPosition: number
constructor(blockPosition: number, dataPosition: number) {
Expand Down
Loading