Skip to content

Commit

Permalink
(Mostly) Bug Fixes for writing Strings (#35)
Browse files Browse the repository at this point in the history
- Fix String write for when `length(str) != ncodeunits(str)`
- `write(path::String, table)` now returns `path` instead of number of bytes written.
- Support for writing `Char`
- Attempting to write a String >254 bytes is now an error (previously a warning)
- Change tests to depend on tables created within the tests as opposed to a .dbf file.
- Added quirks/gotchas to README
  • Loading branch information
joshday authored Jun 18, 2024
1 parent f7768b5 commit e5d4c37
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 134 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "DBFTables"
uuid = "75c7ada1-017a-5fb6-b8c7-2125ff2d6c93"
version = "1.2.5"
version = "1.2.6"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,17 @@ The DBF header contains information on the amount of rows, which columns are pre
The `DBFTables.Table` struct holds both the header and data. All data is read into memory in one go as a `Vector{UInt8}`. To provide efficient access into the individual entries, we use [WeakRefStrings](https://github.com/JuliaData/WeakRefStrings.jl/). WeakRefStrings' `StringArray` only holds the offsets and lengths into the `Vector{UInt8}` with all the data. Then we still need to convert from the string to the julia type. This is done on demand with `dbf_value`.

Note that the format also contains a "record deleted" flag, which is represented by a `'*'` at the start of the row. When this is encountered the record should be treated as if it doesn't exist. Since normally writers strip these records when writing, they are rarely encountered. For that reason this package ignores these flags by default right now. To check for the flags yourself, there is the `isdeleted` function. A sample file with deleted record flags is available [here](https://issues.qgis.org/issues/11007#note-30).


## Quirks and Gotchas

The DBF format is quite old (introduced in 1983). As such, it has some quirks that may not be immediately obvious:

1. An empty string is equivalent to a missing value. Thus an empty string in a table will not survive a `write`/`read` round trip.
2. Strings are limited to 254 characters. Attempting to write longer Strings results in an error.
3. In order to support as many versions of DBF as possible, DBFTables.jl will only write data as one of the following DBF data types:
- `'C'` (Character): `String`s (and anything else that doesn't doesn't match one of the other three types).
- `'N'` (Numeric): `Integer`s and `AbstractFloat`s.
- `'L'` (Logical): `Bool`s.
- `'D'` (Date): `Date`s.
4. The `'N` (Numeric) data type restricts values to fit within 20 printed characters. All `Int64`s fit within 20 characters, but `Float64`s may not. E.g. `string(nextfloat(-Inf))` is 23 characters. DBFTables.jl will remove the least significant digits (loss of precision) in order to fit within the 20 character limit.
54 changes: 19 additions & 35 deletions src/DBFTables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,9 @@ function FieldDescriptor(name::Symbol, data::AbstractVector)
elseif T === Union{} # data is only missings
len = 0x01
elseif char === 'C'
width = T <: AbstractString ? maximum(length, itr) : maximum(x -> length(string(x)), itr)
if width > 254
@warn "Due to DBF limitations, strings in field $name will be truncated to 254 characters."
len = UInt8(254)
else
len = UInt8(width)
end
width = maximum(x -> ncodeunits(string(x)), itr)
width > 254 && error("String data must be <254 characters due to DBF limitations. Found: $width.")
len = UInt8(width)
elseif char === 'N'
len = UInt8(20)
ndec = T <: AbstractFloat ? 0x1 : 0x0
Expand Down Expand Up @@ -73,7 +69,7 @@ end
#-----------------------------------------------------------------------# conversions: Julia-to-DBF
# These are the only types DBFTables.jl will use to save data as.
"Get the DBF type code from the Julia type. Assumes `Base.nonmissingtype(T)` is the input."
dbf_type(::Type{<:AbstractString}) = 'C'
dbf_type(::Type{<:Union{Char, AbstractString}}) = 'C'
dbf_type(::Type{Bool}) = 'L'
dbf_type(::Type{<:Integer}) = 'N'
dbf_type(::Type{<:AbstractFloat}) = 'N'
Expand All @@ -88,8 +84,9 @@ dbf_value(field::FieldDescriptor, val) = dbf_value(Val(field.dbf_type), field.le

# String (or any other type that gets mapped to 'C')
function dbf_value(::Val{'C'}, len::UInt8, x)
out = replace(rpad(x, len), !isascii => x -> '_' ^ textwidth(x))
length(out) > 254 ? out[1:254] : out
s = string(x)
out = s * ' ' ^ (len - ncodeunits(s))
ncodeunits(out) > 254 ? error("The DBF format cannot save strings >254 characters.") : out
end
dbf_value(::Val{'C'}, len::UInt8, ::Missing) = ' ' ^ len

Expand All @@ -111,7 +108,7 @@ function dbf_value(::Val{'N'}, ::UInt8, x::AbstractFloat)
s2 = @sprintf "%.20e" x
i = findfirst('e', s2)
s_end = replace(s2[i:end], '+' => "")
len = length(s_end)
len = length(s_end)
n = 20 - len
out = s2[1:n] * s_end
@warn "A DBF limitation has reduced the precision of $x by $(length(s) - 20) digits."
Expand Down Expand Up @@ -145,8 +142,8 @@ end
julia_value(o::FieldDescriptor, s::AbstractString) = julia_value(o.type, Val(o.dbf_type), s::AbstractString)

function julia_value_string(s::AbstractString)
s2 = strip(s)
isempty(s2) ? missing : String(s2)
out = strip(x -> isspace(x) || x == '\0', s)
isempty(out) ? missing : out
end

julia_value(::Type{String}, ::Val{'C'}, s::AbstractString) = julia_value_string(s)
Expand Down Expand Up @@ -192,19 +189,14 @@ end

"Read a field descriptor from the stream, and create a FieldDescriptor struct"
function read_dbf_field(io::IO)
n_bytes_field_name = 11 # field name can be up to 11 bytes long, delimited by '\0' (end of string, EOS)
field_name_bytes = read(io, n_bytes_field_name)
pos_eos = findfirst(iszero, field_name_bytes)
n = pos_eos === nothing ? n_bytes_field_name : pos_eos - 1
field_name = Symbol(field_name_bytes[1:n])

field_type = read(io, Char)
name = Symbol(filter!(!iszero, read(io, 11))) # 11 bytes padded by '\0'
dbf_type = read(io, Char)
skip(io, 4) # skip
field_len = read(io, UInt8)
field_dec = read(io, UInt8)
length = read(io, UInt8)
ndec = read(io, UInt8)
skip(io, 14) # reserved
jltype = julia_type(Val(field_type), field_dec)
return FieldDescriptor(field_name, jltype, field_type, field_len, field_dec)
type = julia_type(Val(dbf_type), ndec)
return FieldDescriptor(name, type, dbf_type, length, ndec)
end

reserved(n) = fill(0x00, n)
Expand Down Expand Up @@ -247,15 +239,7 @@ function Header(io::IO)
fieldcolumns[field.name] = col
push!(fields, field)
col += 1

# peek if we are at the end
mark(io)
trm = read(io, UInt8)
if trm == 0xD
break
else
reset(io)
end
peek(io) == 0x0d && break
end

return Header(
Expand Down Expand Up @@ -332,7 +316,7 @@ function Table(io::IO)
# Make sure data is read at the right position
bytes_to_skip = header.hsize - position(io)
bytes_to_skip > 0 && skip(io, bytes_to_skip)

data = Vector{UInt8}(undef, header.rsize * header.records)
read!(io, data)
strings = _create_stringarray(header, data)
Expand Down Expand Up @@ -463,7 +447,7 @@ Base.write(path::AbstractString, dbf::Table) = open(io -> Base.write(io, dbf), t


"Generic .dbf writer for the Tables.jl interface."
write(path::AbstractString, tbl) = open(io -> write(io, tbl), touch(path), "w")
write(path::AbstractString, tbl) = (open(io -> write(io, tbl), touch(path), "w"); path)

function write(io::IO, tbl)
dct = Tables.dictcolumntable(tbl)
Expand Down
177 changes: 79 additions & 98 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,118 +4,99 @@ using Tables
using DataFrames
using Dates

test_dbf_path = joinpath(@__DIR__, "test.dbf")
dbf = DBFTables.Table(test_dbf_path)
df = DataFrame(dbf)
row, st = iterate(dbf)
#-----------------------------------------------------------------------------# setup
df = DataFrame(
a = [1,2,3],
b = ["one", "two", "three"],
c = [true, true, false],
d = [today() + Day(i) for i in 1:3],
e = 1.0:3.0,
f = ["😄", "", "∫eˣ123"]
)

# Same as above but with missings
df2 = vcat(df, DataFrame([missing missing missing missing missing missing], names(df)))

# `df2` as a DBFTables.Table
dbf = DBFTables.Table(DBFTables.write(tempname(), df2))

# `dbf` after write/read roundtrip
dbf2 = DBFTables.Table(DBFTables.write(tempname(), dbf))

# Check that data survives a write/read roundtrip
function roundtrip(t)
path = DBFTables.write(tempname(), t)
t2 = DBFTables.Table(path)
isequal(DataFrame(t), DataFrame(t2))
end

#-----------------------------------------------------------------------------# tests
@testset "DBFTables" begin
@testset "Writing" begin
tables_equal(tbl1, tbl2) = all(zip(Tables.columns(tbl1), Tables.columns(tbl2))) do (t1, t2)
all(ismissing(a) ? ismissing(b) : a == b for (a,b) in zip(t1,t2))
end
function _roundtrip(table)
file = joinpath(tempdir(), "test.dbf")
DBFTables.write(file, table)
table2 = DBFTables.Table(file)
end
roundtrip(table) = tables_equal(DataFrame(table), DataFrame(_roundtrip(table)))
@testset "DBFTables.Table roundtrip" begin
@test Tables.schema(df2) == Tables.schema(dbf)
@test Tables.schema(dbf) == Tables.schema(dbf2)
@test isequal(NamedTuple.(dbf), NamedTuple.(dbf2))

@test DBFTables.isdeleted(dbf2) isa BitVector
@test all(.!DBFTables.isdeleted(dbf2))
@test !DBFTables.isdeleted(dbf2, 3)

@test ismissing(dbf2.a[end])
end

@testset "Tables.jl roundtrips" begin
@test roundtrip(df)
@test roundtrip(df2)
@test roundtrip(dbf)
@test roundtrip([(x=Float32(1), y=1), (x=Float32(2), y=2), (x=missing, y=3)])
@test roundtrip([(x=true, y="test"), (x=missing, y=missing)])
@test roundtrip([(x=today(), y=missing), (x=missing, y=today())])
@test roundtrip([(; x=1.0), (;x=missing)])
@test roundtrip([(; x=1.0), (; x=missing)])
@test roundtrip([(; x=missing), (; x=missing)])

@test_warn "No DBF type" DBFTables.write(tempname(), [(; x = rand(1))])
@test_warn "truncated to 254 characters" DBFTables.write(tempname(), [(; x = rand(999))])

# Base.write for DBFTables.Table
file = joinpath(tempdir(), "test.dbf")
write(file, dbf)
dbf2 = DBFTables.Table(file)
@test tables_equal(dbf, dbf2)
@test_warn "No DBF type associated with Julia type Vector{Float64}" DBFTables.write(tempname(), [(; x = rand(5))])
@test_throws Exception DBFTables.write(tempname(), [(; x = rand(999))])
end

@testset "DataFrame indexing" begin
@test size(df, 1) == 7 # records
@test size(df, 2) == 6 # fields
@test df[2, :CHAR] == "John"
@test df[1, :DATE] == Date("19900102", dateformat"yyyymmdd")
@test df[3, :BOOL] == false
@test df[1, :FLOAT] == 10.21
@test df[2, :NUMERIC] == 12.21
@test df[3, :INTEGER] == 102
end

@testset "missing entries" begin
@test ismissing(df[4, :BOOL])
@test ismissing(df[5, :FLOAT])
@test ismissing(df[6, :NUMERIC])
@test ismissing(df[7, :INTEGER])
end

@testset "header" begin
h = DBFTables.Header(open(test_dbf_path))
@test h.version == 3
@test h.last_update == Date("20140806", dateformat"yyyymmdd")
@test h.records == 7
@test length(h.fields) == 6
@testset "Header" begin
# Verify that Header survives write/read roundtrip
h, h2 = getfield(dbf, :header), getfield(dbf2, :header)
for name in fieldnames(DBFTables.Header)
@test getfield(h, name) == getfield(h2, name)
end
end

@testset "show" begin
@test sprint(show, row) === sprint(show, NamedTuple(row))
# use replace to update to julia 1.4 union printing
@test replace(sprint(show, dbf), r"\} +" => "}") ===
"DBFTables.Table with 7 rows and 6 columns\nTables.Schema:\n :CHAR Union{Missing, String}\n :DATE Union{Missing, Date}\n :BOOL Union{Missing, Bool}\n :FLOAT Union{Missing, Float64}\n :NUMERIC Union{Missing, Float64}\n :INTEGER Union{Missing, $Int}\n"
str = """
DBFTables.Table with 4 rows and 6 columns
Tables.Schema:
:a Union{Missing, $Int}
:b Union{Missing, String}
:c Union{Missing, Bool}
:d Union{Missing, Date}
:e Union{Missing, Float64}
:f Union{Missing, String}
"""
@test sprint(show, dbf) == str
end

@testset "iterate" begin
@test st === 2
@test haskey(row, :CHAR)
@test row.CHAR === "Bob"
@test row[2] === Date("19900102", dateformat"yyyymmdd")
@test_throws ArgumentError row.nonexistent_field
firstrow = (
CHAR = "Bob",
DATE = Date("19900102", dateformat"yyyymmdd"),
BOOL = false,
FLOAT = 10.21,
NUMERIC = 11.21,
INTEGER = 100,
)
@test NamedTuple(row) === firstrow
@test row isa DBFTables.Row
@test row isa Tables.AbstractRow
@test length(row) === 6
@test size(row) === (6,)
@test size(row, 1) === 6
@test_throws BoundsError size(row, 2)
@test DBFTables.getrow(row) === 1
@test DBFTables.gettable(row) === dbf
@test sum(1 for row in dbf) === 7
@test sum(1 for cell in row) === 6
@test propertynames(dbf) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER]
@test propertynames(row) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER]
end
@testset "iterate and other Base methods" begin
@test size(dbf) == size(df2)
@test size(dbf, 1) == size(df2, 1)
@test size(dbf, 2) == size(df2, 2)
for row in dbf
@test_throws ArgumentError row.nonexistent_field
@test length(row) == length('a':'f')
@test size(row) == (length(row), )
@test size(row, 1) == length(row)
@test propertynames(row) == Symbol.('a':'f')
for prop in propertynames(row)
@test getproperty(row, prop) isa Any # dummy test to ensure no error is thrown
end
end

@testset "column" begin
@test size(dbf) === (7, 6)
@test size(dbf, 2) === 6

@test length(dbf.CHAR) === 7
@test dbf.CHAR isa Vector{Union{String,Missing}}
@test dbf.INTEGER isa Vector{Union{Int,Missing}}
@test_throws ArgumentError row.nonexistent_field
@test dbf.INTEGER[2] === 101
@test ismissing(dbf.INTEGER[7])
@test dbf.CHAR[2] === "John"
@test ismissing(dbf.CHAR[7])

@test DBFTables.isdeleted(dbf) isa BitVector
@test all(.!DBFTables.isdeleted(dbf))
@test !DBFTables.isdeleted(dbf, 3)
@test sum(1 for row in dbf) === 4
@test sum(1 for cell in first(dbf)) === 6
end

@testset "Numeric 20-character Limit Nonsense" begin
Expand All @@ -127,7 +108,7 @@ row, st = iterate(dbf)
@test DBFTables.dbf_value(Val('N'), 0x01, negbig) == string(negbig)
@test_throws Exception DBFTables.dbf_value(Val('N'), 0x01, negbig - 1)

@test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf))
@test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf))
@test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf))
@test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf))
end
end # testset "DBFTables"
end
Binary file removed test/test.dbf
Binary file not shown.

0 comments on commit e5d4c37

Please sign in to comment.