Skip to content

Commit

Permalink
WIP: string array support
Browse files Browse the repository at this point in the history
  • Loading branch information
terraputix committed Feb 5, 2025
1 parent d46b46f commit 7e30980
Show file tree
Hide file tree
Showing 10 changed files with 342 additions and 26 deletions.
7 changes: 6 additions & 1 deletion Swift/OmFileFormat/OmFileFormat.swift
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,12 @@ public enum CompressionType: UInt8, Codable {
/// PFor integer compression. Floating point values are scaled to 32 bit signed integers. Doubles are scaled to 64 bit signed integers.
case pfor_delta2d = 2

/// Similar to `pfor_delta2d_int16` but applies `log10(1+x)` before
/// Similar to `pfor_delta2d_int16` but applies `log10(1+x)` before compression
case pfor_delta2d_int16_logarithmic = 3

/// No compression, currently only supported for string data
case none = 255

func toC() -> OmCompression_t {
switch self {
case .pfor_delta2d_int16:
Expand All @@ -76,6 +79,8 @@ public enum CompressionType: UInt8, Codable {
return COMPRESSION_PFOR_DELTA2D
case .pfor_delta2d_int16_logarithmic:
return COMPRESSION_PFOR_DELTA2D_INT16_LOGARITHMIC
case .none:
return COMPRESSION_NONE
}
}
}
119 changes: 118 additions & 1 deletion Swift/OmFileFormat/OmFileReader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,20 @@ public struct OmFileReader<Backend: OmFileReaderBackend> {
io_size_merge: io_size_merge
)
}

/// Returns a reader specialised for string arrays.
///
/// - Parameters:
///   - io_size_max: Forwarded unchanged to the string array reader.
///   - io_size_merge: Forwarded unchanged to the string array reader.
/// - Returns: A string array reader sharing this reader's backend and variable,
///   or `nil` when the variable's data type is not `.string_array`.
public func asStringArray(io_size_max: UInt64 = 65536, io_size_merge: UInt64 = 512) -> OmFileReaderStringArray<Backend>? {
    if dataType != .string_array {
        return nil
    }
    let stringReader = OmFileReaderStringArray<Backend>(
        fn: fn,
        variable: variable,
        io_size_max: io_size_max,
        io_size_merge: io_size_merge
    )
    return stringReader
}
}

extension OmFileReader where Backend == MmapFile {
Expand Down Expand Up @@ -336,6 +350,109 @@ public struct OmFileReaderArray<Backend: OmFileReaderBackend, OmType: OmFileArra
}
}

/// Specialized reader for string arrays.
///
/// Strings are located through a look-up table (LUT) of `UInt64` byte offsets:
/// entry `i` is the start offset of string `i` (row-major linear index) and
/// entry `i + 1` is its end offset.
public struct OmFileReaderStringArray<Backend: OmFileReaderBackend> {
    /// Points to the underlying memory
    public let fn: Backend
    let variable: UnsafePointer<OmVariable_t?>?
    let io_size_max: UInt64
    let io_size_merge: UInt64

    /// Get the dimensions of the string array
    public func getDimensions() -> UnsafeBufferPointer<UInt64> {
        let dimensions = om_variable_get_dimensions(variable)
        return UnsafeBufferPointer<UInt64>(start: dimensions.values, count: Int(dimensions.count))
    }

    /// Load the look-up table of string offsets.
    ///
    /// - Returns: The LUT entries, or an empty array when the variable is not a string array.
    public func getLutTable() -> [UInt64] {
        guard let variable = self.variable else {
            fatalError("Variable is nil")
        }
        // Check the type before interpreting the variable memory as array metadata.
        guard DataType(rawValue: UInt8(om_variable_get_type(variable).rawValue)) == .string_array else {
            return []
        }
        let meta = UnsafeRawPointer(variable).assumingMemoryBound(to: OmVariableArrayV3_t.self).pointee
        let lutOffset = meta.lut_offset
        let lutSize = meta.lut_size
        // NOTE(review): assumes the LUT is suitably aligned for UInt64 loads; the writer aligns
        // the buffer before emitting the LUT -- confirm for files from other producers.
        let lutPtr = self.fn.getData(offset: Int(lutOffset), count: Int(lutSize)).assumingMemoryBound(to: UInt64.self)
        let buffer = UnsafeBufferPointer(start: lutPtr, count: Int(lutSize / 8))
        return Array(buffer)
    }

    /// Read the entire string array
    public func read() throws -> [String] {
        let fullRanges = self.getDimensions().map { 0..<$0 }
        return try read(range: fullRanges)
    }

    /// Read a subset of the string array.
    ///
    /// - Parameter range: One half-open range per dimension, in dimension order.
    /// - Returns: The selected strings in row-major order (rightmost dimension varies fastest).
    ///   An empty selection yields an empty array.
    /// - Throws: When the number of ranges does not match the number of dimensions, a range
    ///   exceeds its dimension, or the look-up table is too short for the array.
    public func read(range: [Range<UInt64>]) throws -> [String] {
        let dimensions = self.getDimensions().map { Int($0) }
        // NOTE(review): `omEncoder` is the only error case visible in this change; switch to a
        // decoder-specific case if the project has one.
        guard range.count == dimensions.count else {
            throw OmFileFormatSwiftError.omEncoder(error: "Number of ranges (\(range.count)) does not match number of dimensions (\(dimensions.count))")
        }
        let ranges = range.map { Int($0.lowerBound)..<Int($0.upperBound) }
        for (axis, selection) in ranges.enumerated() {
            // Out-of-bounds ranges would otherwise silently alias a different element.
            guard selection.upperBound <= dimensions[axis] else {
                throw OmFileFormatSwiftError.omEncoder(error: "Range \(selection) exceeds dimension \(axis) of size \(dimensions[axis])")
            }
        }

        let totalCount = ranges.reduce(1) { $0 * $1.count }
        guard totalCount > 0 else {
            return []
        }

        let lutTable = self.getLutTable()
        let elementCount = dimensions.reduce(1, *)
        // One start offset per element plus one trailing end offset is required.
        guard lutTable.count > elementCount else {
            throw OmFileFormatSwiftError.omEncoder(error: "Look-up table has \(lutTable.count) entries, expected \(elementCount + 1)")
        }

        // Row-major strides: the rightmost dimension has stride 1.
        var strides = [Int](repeating: 1, count: dimensions.count)
        for axis in stride(from: dimensions.count - 2, through: 0, by: -1) {
            strides[axis] = strides[axis + 1] * dimensions[axis + 1]
        }

        var strings = [String]()
        strings.reserveCapacity(totalCount)

        var position = ranges.map { $0.lowerBound }
        while true {
            let linearIndex = zip(position, strides).reduce(0) { $0 + $1.0 * $1.1 }

            // The string spans from its own LUT entry to the next one.
            let startOffset = lutTable[linearIndex]
            let endOffset = lutTable[linearIndex + 1]
            strings.append(try self.readString(start: Int(startOffset), end: Int(endOffset)))

            // Advance like an odometer, starting from the rightmost dimension.
            var axis = dimensions.count - 1
            while axis >= 0 {
                position[axis] += 1
                if position[axis] < ranges[axis].upperBound {
                    break
                }
                position[axis] = ranges[axis].lowerBound
                axis -= 1
            }
            // Every dimension wrapped around (or there are no dimensions): selection exhausted.
            if axis < 0 {
                break
            }
        }

        return strings
    }

    /// Read a single string from the specified start and end offset.
    ///
    /// - Parameters:
    ///   - start: Byte offset of the first byte of the string.
    ///   - end: Byte offset one past the last byte of the string.
    /// - Returns: The decoded string. Invalid UTF-8 sequences are repaired with U+FFFD
    ///   instead of discarding the whole string.
    /// - Throws: When the offsets are negative or `end` lies before `start`.
    public func readString(start: Int, end: Int) throws -> String {
        guard start >= 0, end >= start else {
            throw OmFileFormatSwiftError.omEncoder(error: "Invalid string offsets \(start)..<\(end)")
        }
        let byteCount = end - start
        guard byteCount > 0 else {
            return ""
        }
        let stringData = self.fn.getData(offset: start, count: byteCount)
        let buffer = UnsafeRawBufferPointer(start: stringData, count: byteCount)
        return String(decoding: buffer, as: UTF8.self)
    }
}

extension OmFileReaderBackend {
/// Read and decode
func decode(decoder: UnsafePointer<OmDecoder_t>, into: UnsafeMutableRawPointer) throws {
Expand All @@ -354,7 +471,7 @@ extension OmFileReaderBackend {
om_decoder_init_data_read(&dataRead, &indexRead)

var error: OmError_t = ERROR_OK
/// Loop over data blocks and read compressed data chunks
// Loop over data blocks and read compressed data chunks
while om_decoder_next_data_read(decoder, &dataRead, indexData, indexRead.count, &error) {
//print("Read data \(dataRead) for chunk index \(dataRead.chunkIndex)")
let dataData = self.getData(offset: Int(dataRead.offset), count: Int(dataRead.count))
Expand Down
81 changes: 80 additions & 1 deletion Swift/OmFileFormat/OmFileWriter.swift
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ public struct OmFileWriter<FileHandle: OmFileWriterBackend> {
return try .init(dimensions: dimensions, chunkDimensions: chunkDimensions, compression: compression, scale_factor: scale_factor, add_offset: add_offset, buffer: buffer)
}

/// Create a writer for a string array with the given dimensions.
///
/// Calls `writeHeaderIfRequired()` first, then hands this writer's buffer to the
/// string array writer.
///
/// - Parameter dimensions: Size of each dimension of the string array.
/// - Returns: A string array writer backed by this writer's buffer.
/// - Throws: Any error raised by `writeHeaderIfRequired()`.
public func prepareStringArray(dimensions: [UInt64]) throws -> OmFileWriterStringArray<FileHandle> {
    try writeHeaderIfRequired()
    let stringWriter = OmFileWriterStringArray<FileHandle>(dimensions: dimensions, buffer: buffer)
    return stringWriter
}

public func write(array: OmFileWriterArrayFinalised, name: String, children: [OmOffsetSize]) throws -> OmOffsetSize {
try writeHeaderIfRequired()
guard array.dimensions.count == array.chunks.count else {
Expand Down Expand Up @@ -157,6 +162,13 @@ public final class OmFileWriterArray<OmType: OmFileArrayDataTypeProtocol, FileHa
self.scale_factor = scale_factor
self.add_offset = add_offset

if OmType.dataTypeArray == .string_array {
// string arrays need to have chunk dimensions of 1
if !chunkDimensions.allSatisfy({ $0 == 1 }) {
throw OmFileFormatSwiftError.omEncoder(error: "String arrays need to have chunk dimensions of 1")
}
}

// Note: The encoder keeps the pointer to `&self.dimensions`. It is important that this array is not deallocated!
self.encoder = OmEncoder_t()
let error = om_encoder_init(&encoder, scale_factor, add_offset, compression.toC(), OmType.dataTypeArray.toC(), &self.dimensions, &self.chunks, UInt64(dimensions.count))
Expand Down Expand Up @@ -214,7 +226,7 @@ public final class OmFileWriterArray<OmType: OmFileArrayDataTypeProtocol, FileHa
let numberOfChunksInArray = om_encoder_count_chunks_in_array(&encoder, arrayCount)

/// Store data start address if this is the first time this read is called
if chunkIndex == 0 {
if self.chunkIndex == 0 {
lookUpTable[chunkIndex] = UInt64(buffer.totalBytesWritten)
}

Expand Down Expand Up @@ -295,6 +307,73 @@ public struct OmFileWriterArrayFinalised {
let lutOffset: UInt64
}

/// Specialized string array writer.
///
/// All strings are written back to back as UTF-8. A look-up table (LUT) of
/// `UInt64` offsets records where each string starts; the final entry is the
/// end offset of the last string.
public final class OmFileWriterStringArray<FileHandle: OmFileWriterBackend> {
    /// Offsets of every string; `count` is number of strings + 1.
    private var lookUpTable: [UInt64]
    /// Number of string bytes written so far by `writeData`.
    private var currentPosition: UInt64
    private let buffer: OmBufferedWriter<FileHandle>
    private let dimensions: [UInt64]

    /// - Parameters:
    ///   - dimensions: Size of each dimension of the string array.
    ///   - buffer: Buffered writer that receives the string bytes and the LUT.
    public init(dimensions: [UInt64], buffer: OmBufferedWriter<FileHandle>) {
        self.dimensions = dimensions
        // Allocate space for a lookup table. Needs number_of_strings + 1 entries:
        // one start offset per string plus the end offset of the last string.
        self.lookUpTable = .init(repeating: 0, count: Int(dimensions.reduce(1, *) + 1))
        self.currentPosition = 0
        self.buffer = buffer
    }

    /// Write all strings of the array at once.
    ///
    /// - Parameter array: Strings in row-major order; the count must equal the product of the dimensions.
    /// - Throws: When the count does not match the dimensions, when data was already written,
    ///   or when the buffer cannot be grown.
    public func writeData(array: [String]) throws {
        // Verify array size matches dimensions
        let expectedSize = dimensions.reduce(1, *)
        guard array.count == expectedSize && self.currentPosition == 0 else {
            throw OmFileFormatSwiftError.omEncoder(error: "String arrays need to be encoded all at once and must match the dimensions")
        }

        // Offset of the first string byte; all LUT entries are relative to the file start.
        let dataStart = UInt64(buffer.totalBytesWritten)
        lookUpTable[0] = dataStart

        // Pre-calculate total required capacity
        let totalCapacity = array.reduce(0) { $0 + $1.utf8.count }
        try buffer.reallocate(minimumCapacity: totalCapacity)

        // Write all strings consecutively
        let destination = buffer.bufferAtWritePosition
        var written = 0
        for (i, string) in array.enumerated() {
            // `withUTF8` always provides contiguous UTF-8 storage (copying if needed), so no
            // string can be skipped the way `withContiguousStorageIfAvailable` allows.
            var contiguous = string
            contiguous.withUTF8 { utf8 in
                if let base = utf8.baseAddress, utf8.count > 0 {
                    destination.advanced(by: written).copyMemory(from: base, byteCount: utf8.count)
                }
                written += utf8.count
            }
            lookUpTable[i + 1] = dataStart + UInt64(written)
        }

        self.currentPosition = UInt64(written)
        buffer.incrementWritePosition(by: written)
    }

    /// Write the LUT and return the metadata describing the string array.
    ///
    /// - Returns: Finalised array description with `.none` compression, `.string_array`
    ///   data type and a chunk size of 1 in every dimension.
    /// - Throws: Any error raised while aligning or growing the buffer.
    public func finalise() throws -> OmFileWriterArrayFinalised {
        try buffer.alignTo64Bytes()
        let lutOffset = buffer.totalBytesWritten

        // Write uncompressed LUT
        let lutSize = Int(lookUpTable.count * MemoryLayout<UInt64>.size)
        try buffer.reallocate(minimumCapacity: lutSize)
        buffer.bufferAtWritePosition.copyMemory(from: self.lookUpTable, byteCount: lutSize)
        buffer.incrementWritePosition(by: lutSize)

        return OmFileWriterArrayFinalised(
            scale_factor: 0,
            add_offset: 0,
            compression: .none,
            datatype: .string_array,
            dimensions: dimensions,
            chunks: dimensions.map { _ in 1 }, // one string per chunk
            lutSize: UInt64(lutSize),
            lutOffset: UInt64(lutOffset)
        )
    }
}

/// Wrapper for the internal C structure to keep offset and size
public struct OmOffsetSize {
let offset: UInt64
Expand Down
80 changes: 79 additions & 1 deletion Tests/OmFileFormatTests/OmFileFormatTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,26 @@ import Foundation
#expect(bytes[40..<40+17] == [5, 4, 5, 0, 0, 0, 0, 0, 82, 9, 188, 0, 105, 110, 116, 51, 50]) // scalar int32
#expect(bytes[65..<65+22] == [4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 64, 42, 129, 103, 65, 100, 111, 117, 98, 108, 101, 0]) // scalar double
#expect(bytes[88..<88+34] == [11, 4, 6, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 109, 121, 95, 97, 116, 116, 114, 105, 98, 117, 116, 101, 115, 116, 114, 105, 110, 103]) // scalar string
#expect(bytes[128..<128+140] == [20, 0, 4, 0, 3, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 34, 0, 0, 0, 0, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 88, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 100, 97, 116, 97]) // array meta
#expect(bytes[128..<128+140] == [
20, 0, 4, 0, 3, 0, 0, 0,
5, 0, 0, 0, 0, 0, 0, 0,
30, 0, 0, 0, 0, 0, 0, 0,
3, 0, 0, 0, 0, 0, 0, 0,
0, 0, 128, 63, 0, 0, 0, 0,
17, 0, 0, 0, 0, 0, 0, 0,
22, 0, 0, 0, 0, 0, 0, 0,
34, 0, 0, 0, 0, 0, 0, 0,
40, 0, 0, 0, 0, 0, 0, 0,
64, 0, 0, 0, 0, 0, 0, 0,
88, 0, 0, 0, 0, 0, 0, 0,
3, 0, 0, 0, 0, 0, 0, 0,
3, 0, 0, 0, 0, 0, 0, 0,
3, 0, 0, 0, 0, 0, 0, 0,
2, 0, 0, 0, 0, 0, 0, 0,
2, 0, 0, 0, 0, 0, 0, 0,
2, 0, 0, 0, 0, 0, 0, 0,
100, 97, 116, 97
]) // array meta
#expect(bytes[272..<296] == [79, 77, 3, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 140, 0, 0, 0, 0, 0, 0, 0]) // trailer

// Test interpolation
Expand All @@ -325,6 +344,65 @@ import Foundation
#expect(try read.readInterpolated(dim0X: 0, dim0XFraction: 0.8, dim0Y: 0, dim0YFraction: 0.9, dim0Nx: 3, dim1: 0..<3) == [10.5, 11.5, 12.5])
}

/// Writes a 3x3 string array, verifies the exact on-disk byte layout
/// (header, string bytes, LUT, array metadata, trailer) and reads it back.
@Test func writeStringArray() throws {
    let file = "writeStringArray.om"
    let fn = try FileHandle.createNewFile(file: file, overwrite: true)
    defer { try? FileManager.default.removeItem(atPath: file) }
    let fileWriter = OmFileWriter(fn: fn, initialCapacity: 8)

    let writer = try fileWriter.prepareStringArray(dimensions: [3,3])

    // Includes multi-byte UTF-8 characters to check that offsets count bytes, not characters.
    let data = ["string1", "string2", "string3", "string4", "string5", "string6", "string7", "string8_äöüß¿¡!?", "string9____"]
    try writer.writeData(array: data)
    let variableMeta = try writer.finalise()
    let variable = try fileWriter.write(array: variableMeta, name: "data", children: [])
    try fileWriter.writeTrailer(rootVariable: variable)

    let readFn = try MmapFile(fn: FileHandle.openFileReading(file: file))
    #expect(readFn.count == 280)
    let bytes = Data(bytesNoCopy: UnsafeMutableRawPointer(mutating: readFn.getData(offset: 0, count: readFn.count)), count: readFn.count, deallocator: .none).map{ UInt8($0) }
    #expect(bytes[0..<3] == [79, 77, 3])
    #expect(bytes[3..<3+7] == [115, 116, 114, 105, 110, 103, 49]) // string1
    #expect(bytes[10..<10+7] == [115, 116, 114, 105, 110, 103, 50]) // string2
    #expect(bytes[17..<17+7] == [115, 116, 114, 105, 110, 103, 51]) // string3
    #expect(bytes[24..<24+7] == [115, 116, 114, 105, 110, 103, 52]) // string4
    #expect(bytes[31..<31+7] == [115, 116, 114, 105, 110, 103, 53]) // string5
    #expect(bytes[38..<38+7] == [115, 116, 114, 105, 110, 103, 54]) // string6
    #expect(bytes[45..<45+7] == [115, 116, 114, 105, 110, 103, 55]) // string7
    #expect(bytes[52..<52+22] == [115, 116, 114, 105, 110, 103, 56, 95, 195, 164, 195, 182, 195, 188, 195, 159, 194, 191, 194, 161, 33, 63]) // string8_äöüß¿¡!?
    #expect(bytes[74..<74+11] == [115, 116, 114, 105, 110, 103, 57, 95, 95, 95, 95]) // string9____
    // Ten offsets: the start of each of the nine strings plus the end of the last one.
    #expect(bytes[88..<88+80] == [
        3, 0, 0, 0, 0, 0, 0, 0,
        10, 0, 0, 0, 0, 0, 0, 0,
        17, 0, 0, 0, 0, 0, 0, 0,
        24, 0, 0, 0, 0, 0, 0, 0,
        31, 0, 0, 0, 0, 0, 0, 0,
        38, 0, 0, 0, 0, 0, 0, 0,
        45, 0, 0, 0, 0, 0, 0, 0,
        52, 0, 0, 0, 0, 0, 0, 0,
        74, 0, 0, 0, 0, 0, 0, 0,
        85, 0, 0, 0, 0, 0, 0, 0
    ]) // LUT
    #expect(bytes[88+88..<88+88+76] == [
        22, 4, 4, 0, 0, 0, 0, 0,
        80, 0, 0, 0, 0, 0, 0, 0,
        88, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        3, 0, 0, 0, 0, 0, 0, 0,
        3, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0,
        100, 97, 116, 97
    ]) // array meta
    #expect(bytes[280-24..<280] == [79, 77, 3, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, 0, 0, 76, 0, 0, 0, 0, 0, 0, 0]) // trailer

    // `#require` records a test failure instead of crashing the process when the
    // variable is unexpectedly not a string array.
    let read = try #require(try OmFileReader(fn: readFn).asStringArray())
    let a = try read.read(range: [0..<3, 0..<3])
    #expect(a == data)
}

@Test func writev3() throws {
let file = "writev3.om"
let dims = [UInt64(5),5]
Expand Down
3 changes: 3 additions & 0 deletions c/include/om_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ void om_common_copy16(uint64_t length, float scale_factor, float add_offset, con
void om_common_copy32(uint64_t length, float scale_factor, float add_offset, const void* src, void* dst);
void om_common_copy64(uint64_t length, float scale_factor, float add_offset, const void* src, void* dst);

/// Copy string array
void om_common_copy_string_array(uint64_t length, const void* src, void* dst);

uint64_t om_common_compress_fpxenc32(const void* src, uint64_t length, void* dst);
uint64_t om_common_compress_fpxenc64(const void* src, uint64_t length, void* dst);
uint64_t om_common_decompress_fpxdec32(const void* src, uint64_t length, void* dst);
Expand Down
2 changes: 1 addition & 1 deletion c/include/om_variable.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ typedef enum {
OM_MEMORY_LAYOUT_ARRAY = 1,
OM_MEMORY_LAYOUT_SCALAR = 3,
//OM_MEMORY_LAYOUT_STRING = 4,
//OM_MEMORY_LAYOUT_STRING_ARRAY = 5,
OM_MEMORY_LAYOUT_STRING_ARRAY = 5,
} OmMemoryLayout_t;

/// Check if a variable is legacy or version 3 array of scalar. Legacy files are the entire header containing magic number and version.
Expand Down
Loading

0 comments on commit 7e30980

Please sign in to comment.