From e5d4c37b1bac77ea37d3ee0527193043c0f591d0 Mon Sep 17 00:00:00 2001 From: Josh Day Date: Tue, 18 Jun 2024 14:39:20 -0400 Subject: [PATCH] (Mostly) Bug Fixes for writing Strings (#35) - Fix String write for when `length(str) != ncodeunits(str)` - `write(path::String, table)` now returns `path` instead of number of bytes written. - Support for writing `Char` - Attempting to write a String >254 bytes is now an error (previously a warning) - Change tests to depend on tables created within the tests as opposed to a .dbf file. - Added quirks/gotchas to README --- Project.toml | 2 +- README.md | 14 ++++ src/DBFTables.jl | 54 +++++---------- test/runtests.jl | 177 +++++++++++++++++++++-------------------------- test/test.dbf | Bin 611 -> 0 bytes 5 files changed, 113 insertions(+), 134 deletions(-) delete mode 100644 test/test.dbf diff --git a/Project.toml b/Project.toml index b436519..67e2df3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DBFTables" uuid = "75c7ada1-017a-5fb6-b8c7-2125ff2d6c93" -version = "1.2.5" +version = "1.2.6" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/README.md b/README.md index 75497de..6116c75 100644 --- a/README.md +++ b/README.md @@ -49,3 +49,17 @@ The DBF header contains information on the amount of rows, which columns are pre The `DBFTables.Table` struct holds both the header and data. All data is read into memory in one go as a `Vector{UInt8}`. To provide efficient access into the individual entries, we use [WeakRefStrings](https://github.com/JuliaData/WeakRefStrings.jl/). WeakRefStrings' `StringArray` only holds the offsets and lengths into the `Vector{UInt8}` with all the data. Then we still need to convert from the string to the julia type. This is done on demand with `dbf_value`. Note that the format also contains a "record deleted" flag, which is represented by a `'*'` at the start of the row. When this is encountered the record should be treated as if it doesn't exist. Since normally writers strip these records when writing, they are rarely encountered. For that reason this package ignores these flags by default right now. To check for the flags yourself, there is the `isdeleted` function. A sample file with deleted record flags is available [here](https://issues.qgis.org/issues/11007#note-30). + + +## Quirks and Gotchas + +The DBF format is quite old (introduced in 1983). As such, it has some quirks that may not be immediately obvious: + +1. An empty string is equivalent to a missing value. Thus an empty string in a table will not survive a `write`/`read` round trip. +2. Strings are limited to 254 characters. Attempting to write longer Strings results in an error. +3. In order to support as many versions of DBF as possible, DBFTables.jl will only write data as one of the following DBF data types: + - `'C'` (Character): `String`s (and anything else that doesn't doesn't match one of the other three types). + - `'N'` (Numeric): `Integer`s and `AbstractFloat`s. + - `'L'` (Logical): `Bool`s. + - `'D'` (Date): `Date`s. +4. The `'N` (Numeric) data type restricts values to fit within 20 printed characters. All `Int64`s fit within 20 characters, but `Float64`s may not. E.g. `string(nextfloat(-Inf))` is 23 characters. DBFTables.jl will remove the least significant digits (loss of precision) in order to fit within the 20 character limit. diff --git a/src/DBFTables.jl b/src/DBFTables.jl index 841f4de..8146747 100644 --- a/src/DBFTables.jl +++ b/src/DBFTables.jl @@ -24,13 +24,9 @@ function FieldDescriptor(name::Symbol, data::AbstractVector) elseif T === Union{} # data is only missings len = 0x01 elseif char === 'C' - width = T <: AbstractString ? maximum(length, itr) : maximum(x -> length(string(x)), itr) - if width > 254 - @warn "Due to DBF limitations, strings in field $name will be truncated to 254 characters." - len = UInt8(254) - else - len = UInt8(width) - end + width = maximum(x -> ncodeunits(string(x)), itr) + width > 254 && error("String data must be <254 characters due to DBF limitations. Found: $width.") + len = UInt8(width) elseif char === 'N' len = UInt8(20) ndec = T <: AbstractFloat ? 0x1 : 0x0 @@ -73,7 +69,7 @@ end #-----------------------------------------------------------------------# conversions: Julia-to-DBF # These are the only types DBFTables.jl will use to save data as. "Get the DBF type code from the Julia type. Assumes `Base.nonmissingtype(T)` is the input." -dbf_type(::Type{<:AbstractString}) = 'C' +dbf_type(::Type{<:Union{Char, AbstractString}}) = 'C' dbf_type(::Type{Bool}) = 'L' dbf_type(::Type{<:Integer}) = 'N' dbf_type(::Type{<:AbstractFloat}) = 'N' @@ -88,8 +84,9 @@ dbf_value(field::FieldDescriptor, val) = dbf_value(Val(field.dbf_type), field.le # String (or any other type that gets mapped to 'C') function dbf_value(::Val{'C'}, len::UInt8, x) - out = replace(rpad(x, len), !isascii => x -> '_' ^ textwidth(x)) - length(out) > 254 ? out[1:254] : out + s = string(x) + out = s * ' ' ^ (len - ncodeunits(s)) + ncodeunits(out) > 254 ? error("The DBF format cannot save strings >254 characters.") : out end dbf_value(::Val{'C'}, len::UInt8, ::Missing) = ' ' ^ len @@ -111,7 +108,7 @@ function dbf_value(::Val{'N'}, ::UInt8, x::AbstractFloat) s2 = @sprintf "%.20e" x i = findfirst('e', s2) s_end = replace(s2[i:end], '+' => "") - len = length(s_end) + len = length(s_end) n = 20 - len out = s2[1:n] * s_end @warn "A DBF limitation has reduced the precision of $x by $(length(s) - 20) digits." @@ -145,8 +142,8 @@ end julia_value(o::FieldDescriptor, s::AbstractString) = julia_value(o.type, Val(o.dbf_type), s::AbstractString) function julia_value_string(s::AbstractString) - s2 = strip(s) - isempty(s2) ? missing : String(s2) + out = strip(x -> isspace(x) || x == '\0', s) + isempty(out) ? missing : out end julia_value(::Type{String}, ::Val{'C'}, s::AbstractString) = julia_value_string(s) @@ -192,19 +189,14 @@ end "Read a field descriptor from the stream, and create a FieldDescriptor struct" function read_dbf_field(io::IO) - n_bytes_field_name = 11 # field name can be up to 11 bytes long, delimited by '\0' (end of string, EOS) - field_name_bytes = read(io, n_bytes_field_name) - pos_eos = findfirst(iszero, field_name_bytes) - n = pos_eos === nothing ? n_bytes_field_name : pos_eos - 1 - field_name = Symbol(field_name_bytes[1:n]) - - field_type = read(io, Char) + name = Symbol(filter!(!iszero, read(io, 11))) # 11 bytes padded by '\0' + dbf_type = read(io, Char) skip(io, 4) # skip - field_len = read(io, UInt8) - field_dec = read(io, UInt8) + length = read(io, UInt8) + ndec = read(io, UInt8) skip(io, 14) # reserved - jltype = julia_type(Val(field_type), field_dec) - return FieldDescriptor(field_name, jltype, field_type, field_len, field_dec) + type = julia_type(Val(dbf_type), ndec) + return FieldDescriptor(name, type, dbf_type, length, ndec) end reserved(n) = fill(0x00, n) @@ -247,15 +239,7 @@ function Header(io::IO) fieldcolumns[field.name] = col push!(fields, field) col += 1 - - # peek if we are at the end - mark(io) - trm = read(io, UInt8) - if trm == 0xD - break - else - reset(io) - end + peek(io) == 0x0d && break end return Header( @@ -332,7 +316,7 @@ function Table(io::IO) # Make sure data is read at the right position bytes_to_skip = header.hsize - position(io) bytes_to_skip > 0 && skip(io, bytes_to_skip) - + data = Vector{UInt8}(undef, header.rsize * header.records) read!(io, data) strings = _create_stringarray(header, data) @@ -463,7 +447,7 @@ Base.write(path::AbstractString, dbf::Table) = open(io -> Base.write(io, dbf), t "Generic .dbf writer for the Tables.jl interface." -write(path::AbstractString, tbl) = open(io -> write(io, tbl), touch(path), "w") +write(path::AbstractString, tbl) = (open(io -> write(io, tbl), touch(path), "w"); path) function write(io::IO, tbl) dct = Tables.dictcolumntable(tbl) diff --git a/test/runtests.jl b/test/runtests.jl index 5b09242..0c3f407 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,118 +4,99 @@ using Tables using DataFrames using Dates -test_dbf_path = joinpath(@__DIR__, "test.dbf") -dbf = DBFTables.Table(test_dbf_path) -df = DataFrame(dbf) -row, st = iterate(dbf) +#-----------------------------------------------------------------------------# setup +df = DataFrame( + a = [1,2,3], + b = ["one", "two", "three"], + c = [true, true, false], + d = [today() + Day(i) for i in 1:3], + e = 1.0:3.0, + f = ["😄", "∱", "∫eˣ123"] +) +# Same as above but with missings +df2 = vcat(df, DataFrame([missing missing missing missing missing missing], names(df))) + +# `df2` as a DBFTables.Table +dbf = DBFTables.Table(DBFTables.write(tempname(), df2)) + +# `dbf` after write/read roundtrip +dbf2 = DBFTables.Table(DBFTables.write(tempname(), dbf)) + +# Check that data survives a write/read roundtrip +function roundtrip(t) + path = DBFTables.write(tempname(), t) + t2 = DBFTables.Table(path) + isequal(DataFrame(t), DataFrame(t2)) +end + +#-----------------------------------------------------------------------------# tests @testset "DBFTables" begin - @testset "Writing" begin - tables_equal(tbl1, tbl2) = all(zip(Tables.columns(tbl1), Tables.columns(tbl2))) do (t1, t2) - all(ismissing(a) ? ismissing(b) : a == b for (a,b) in zip(t1,t2)) - end - function _roundtrip(table) - file = joinpath(tempdir(), "test.dbf") - DBFTables.write(file, table) - table2 = DBFTables.Table(file) - end - roundtrip(table) = tables_equal(DataFrame(table), DataFrame(_roundtrip(table))) + @testset "DBFTables.Table roundtrip" begin + @test Tables.schema(df2) == Tables.schema(dbf) + @test Tables.schema(dbf) == Tables.schema(dbf2) + @test isequal(NamedTuple.(dbf), NamedTuple.(dbf2)) + + @test DBFTables.isdeleted(dbf2) isa BitVector + @test all(.!DBFTables.isdeleted(dbf2)) + @test !DBFTables.isdeleted(dbf2, 3) + + @test ismissing(dbf2.a[end]) + end + + @testset "Tables.jl roundtrips" begin @test roundtrip(df) + @test roundtrip(df2) @test roundtrip(dbf) @test roundtrip([(x=Float32(1), y=1), (x=Float32(2), y=2), (x=missing, y=3)]) @test roundtrip([(x=true, y="test"), (x=missing, y=missing)]) @test roundtrip([(x=today(), y=missing), (x=missing, y=today())]) - @test roundtrip([(; x=1.0), (;x=missing)]) + @test roundtrip([(; x=1.0), (; x=missing)]) @test roundtrip([(; x=missing), (; x=missing)]) - @test_warn "No DBF type" DBFTables.write(tempname(), [(; x = rand(1))]) - @test_warn "truncated to 254 characters" DBFTables.write(tempname(), [(; x = rand(999))]) - - # Base.write for DBFTables.Table - file = joinpath(tempdir(), "test.dbf") - write(file, dbf) - dbf2 = DBFTables.Table(file) - @test tables_equal(dbf, dbf2) + @test_warn "No DBF type associated with Julia type Vector{Float64}" DBFTables.write(tempname(), [(; x = rand(5))]) + @test_throws Exception DBFTables.write(tempname(), [(; x = rand(999))]) end - @testset "DataFrame indexing" begin - @test size(df, 1) == 7 # records - @test size(df, 2) == 6 # fields - @test df[2, :CHAR] == "John" - @test df[1, :DATE] == Date("19900102", dateformat"yyyymmdd") - @test df[3, :BOOL] == false - @test df[1, :FLOAT] == 10.21 - @test df[2, :NUMERIC] == 12.21 - @test df[3, :INTEGER] == 102 - end - - @testset "missing entries" begin - @test ismissing(df[4, :BOOL]) - @test ismissing(df[5, :FLOAT]) - @test ismissing(df[6, :NUMERIC]) - @test ismissing(df[7, :INTEGER]) - end - - @testset "header" begin - h = DBFTables.Header(open(test_dbf_path)) - @test h.version == 3 - @test h.last_update == Date("20140806", dateformat"yyyymmdd") - @test h.records == 7 - @test length(h.fields) == 6 + @testset "Header" begin + # Verify that Header survives write/read roundtrip + h, h2 = getfield(dbf, :header), getfield(dbf2, :header) + for name in fieldnames(DBFTables.Header) + @test getfield(h, name) == getfield(h2, name) + end end @testset "show" begin - @test sprint(show, row) === sprint(show, NamedTuple(row)) - # use replace to update to julia 1.4 union printing - @test replace(sprint(show, dbf), r"\} +" => "}") === - "DBFTables.Table with 7 rows and 6 columns\nTables.Schema:\n :CHAR Union{Missing, String}\n :DATE Union{Missing, Date}\n :BOOL Union{Missing, Bool}\n :FLOAT Union{Missing, Float64}\n :NUMERIC Union{Missing, Float64}\n :INTEGER Union{Missing, $Int}\n" + str = """ + DBFTables.Table with 4 rows and 6 columns + Tables.Schema: + :a Union{Missing, $Int} + :b Union{Missing, String} + :c Union{Missing, Bool} + :d Union{Missing, Date} + :e Union{Missing, Float64} + :f Union{Missing, String} + """ + @test sprint(show, dbf) == str end - @testset "iterate" begin - @test st === 2 - @test haskey(row, :CHAR) - @test row.CHAR === "Bob" - @test row[2] === Date("19900102", dateformat"yyyymmdd") - @test_throws ArgumentError row.nonexistent_field - firstrow = ( - CHAR = "Bob", - DATE = Date("19900102", dateformat"yyyymmdd"), - BOOL = false, - FLOAT = 10.21, - NUMERIC = 11.21, - INTEGER = 100, - ) - @test NamedTuple(row) === firstrow - @test row isa DBFTables.Row - @test row isa Tables.AbstractRow - @test length(row) === 6 - @test size(row) === (6,) - @test size(row, 1) === 6 - @test_throws BoundsError size(row, 2) - @test DBFTables.getrow(row) === 1 - @test DBFTables.gettable(row) === dbf - @test sum(1 for row in dbf) === 7 - @test sum(1 for cell in row) === 6 - @test propertynames(dbf) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER] - @test propertynames(row) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER] - end + @testset "iterate and other Base methods" begin + @test size(dbf) == size(df2) + @test size(dbf, 1) == size(df2, 1) + @test size(dbf, 2) == size(df2, 2) + for row in dbf + @test_throws ArgumentError row.nonexistent_field + @test length(row) == length('a':'f') + @test size(row) == (length(row), ) + @test size(row, 1) == length(row) + @test propertynames(row) == Symbol.('a':'f') + for prop in propertynames(row) + @test getproperty(row, prop) isa Any # dummy test to ensure no error is thrown + end + end - @testset "column" begin - @test size(dbf) === (7, 6) - @test size(dbf, 2) === 6 - - @test length(dbf.CHAR) === 7 - @test dbf.CHAR isa Vector{Union{String,Missing}} - @test dbf.INTEGER isa Vector{Union{Int,Missing}} - @test_throws ArgumentError row.nonexistent_field - @test dbf.INTEGER[2] === 101 - @test ismissing(dbf.INTEGER[7]) - @test dbf.CHAR[2] === "John" - @test ismissing(dbf.CHAR[7]) - - @test DBFTables.isdeleted(dbf) isa BitVector - @test all(.!DBFTables.isdeleted(dbf)) - @test !DBFTables.isdeleted(dbf, 3) + @test sum(1 for row in dbf) === 4 + @test sum(1 for cell in first(dbf)) === 6 end @testset "Numeric 20-character Limit Nonsense" begin @@ -127,7 +108,7 @@ row, st = iterate(dbf) @test DBFTables.dbf_value(Val('N'), 0x01, negbig) == string(negbig) @test_throws Exception DBFTables.dbf_value(Val('N'), 0x01, negbig - 1) - @test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf)) - @test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf)) + @test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf)) + @test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf)) end -end # testset "DBFTables" +end diff --git a/test/test.dbf b/test/test.dbf deleted file mode 100644 index 30ab466fffafccb4f810fe633dcbccf56ab6e004..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 611 zcmaKm%L>9U5JjVciXggFR{?i|WhPZiR?^ni2ek#M3%9Nm#Dah9pO`c?#z#GicDQ#= zlG?3VZxBN6#1Y?b{FS>aQokaG&d18aO@ks@bj8s8=)EL~iYV%dL7rg}Mv0rMAnc2O zt7^7{#QSZqTl+GMhd=stoCeDve_}kAz1%ctQ!hJG1LPb4frWX7U@=q^J|{{MfNrwW z)zDZjft|`s^cLqzW2T1GJ6|qun%K2rqHd#LiIXL0n;J6=RdYw)hjaK#nRXayP{TB( Tnyu0PXrd|Arka`NwLEwLUbjy=