From a68f950d14806d4de7c214727fb7980ebd67192f Mon Sep 17 00:00:00 2001 From: tan Date: Thu, 11 Dec 2014 14:37:47 +0530 Subject: [PATCH] correctly parse all integer types. fixes #9289 Used tryparse methods instead of float64_isvalid. If return array is of Any type, try parsing as Int, Bool and Float64 before falling back to substring. --- base/datafmt.jl | 80 ++++++++++++++++++++++++++++--------------------- base/gmp.jl | 7 ++--- base/string.jl | 57 +++++++++++++++++++++-------------- test/readdlm.jl | 5 ++++ 4 files changed, 88 insertions(+), 61 deletions(-) diff --git a/base/datafmt.jl b/base/datafmt.jl index 39310f4e39337..5d36ed08c7d8d 100644 --- a/base/datafmt.jl +++ b/base/datafmt.jl @@ -3,7 +3,7 @@ module DataFmt importall Base -import Base: _default_delims +import Base: _default_delims, tryparse_internal export countlines, readdlm, readcsv, writedlm, writecsv @@ -137,7 +137,6 @@ type DLMStore{T,S<:ByteString} <: DLMHandler sbuff::S auto::Bool eol::Char - tmp64::Array{Float64,1} end function DLMStore{T,S<:ByteString}(::Type{T}, dims::NTuple{2,Integer}, has_header::Bool, sbuff::S, auto::Bool, eol::Char) @@ -145,7 +144,7 @@ function DLMStore{T,S<:ByteString}(::Type{T}, dims::NTuple{2,Integer}, has_heade nrows <= 0 && throw(ArgumentError("number of rows in dims must be > 0, got $nrows")) ncols <= 0 && throw(ArgumentError("number of columns in dims must be > 0, got $ncols")) hdr_offset = has_header ? 1 : 0 - DLMStore{T,S}(fill(SubString(sbuff,1,0), 1, ncols), Array(T, nrows-hdr_offset, ncols), nrows, ncols, 0, 0, hdr_offset, sbuff, auto, eol, Array(Float64,1)) + DLMStore{T,S}(fill(SubString(sbuff,1,0), 1, ncols), Array(T, nrows-hdr_offset, ncols), nrows, ncols, 0, 0, hdr_offset, sbuff, auto, eol) end _chrinstr(sbuff::ByteString, chr::UInt8, startpos::Int, endpos::Int) = (endpos >= startpos) && (C_NULL != ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), pointer(sbuff.data)+startpos-1, chr, endpos-startpos+1)) @@ -158,7 +157,6 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int lastrow = dlmstore.lastrow cells::Array{T,2} = dlmstore.data sbuff::S = dlmstore.sbuff - tmp64 = dlmstore.tmp64 endpos = prevind(sbuff, nextind(sbuff,endpos)) (endpos > 0) && ('\n' == dlmstore.eol) && ('\r' == Char(sbuff[endpos])) && (endpos = prevind(sbuff, endpos)) @@ -189,9 +187,9 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int # fill data if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos) unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"") - fail = colval(unescaped, 1, length(unescaped), cells, drow, col, tmp64) + fail = colval(unescaped, 1, length(unescaped), cells, drow, col) else - fail = colval(sbuff, startpos, endpos, cells, drow, col, tmp64) + fail = colval(sbuff, startpos, endpos, cells, drow, col) end if fail sval = SubString(sbuff,startpos,endpos) @@ -204,9 +202,9 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int # fill header if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos) unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"") - colval(unescaped, 1, length(unescaped), dlmstore.hdr, 1, col, tmp64) + colval(unescaped, 1, length(unescaped), dlmstore.hdr, 1, col) else - colval(sbuff, startpos,endpos, dlmstore.hdr, 1, col, tmp64) + colval(sbuff, startpos,endpos, dlmstore.hdr, 1, col) end end @@ -304,6 +302,8 @@ function dlm_fill(T::DataType, offarr::Vector{Vector{Int}}, dims::NTuple{2,Integ idx = 1 offidx = 1 offsets = offarr[1] + row = 0 + col = 0 try dh = DLMStore(T, dims, has_header, sbuff, auto, eol) while idx <= length(offsets) @@ -320,40 +320,52 @@ function dlm_fill(T::DataType, offarr::Vector{Vector{Int}}, dims::NTuple{2,Integ return result(dh) catch ex isa(ex, TypeError) && (ex.func == :store_cell) && (return dlm_fill(ex.expected, offarr, dims, has_header, sbuff, auto, eol)) - rethrow(ex) + error("at row $row, column $col : $ex") end end - -function colval{T<:Bool, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) - len = endpos-startpos+1 - if (len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), pointer(sbuff)+startpos-1, pointer("true"), 4)) - cells[row,col] = true - return false - elseif (len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), pointer(sbuff)+startpos-1, pointer("false"), 5)) - cells[row,col] = false - return false - end - true +function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Bool,2}, row::Int, col::Int) + n = tryparse_internal(Bool, sbuff, startpos, endpos, false) + isnull(n) || (cells[row,col] = get(n)) + isnull(n) end -function colval{T<:Number, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) - if 0 == ccall(:jl_substrtod, Int32, (Ptr{UInt8},Csize_t,Cint,Ptr{Float64}), sbuff, startpos-1, endpos-startpos+1, tmp64) - cells[row,col] = tmp64[1] - return false - end - true +function colval{T<:Integer, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int) + n = tryparse_internal(T, sbuff, startpos, endpos, 0, false) + isnull(n) || (cells[row,col] = get(n)) + isnull(n) end -colval{T<:AbstractString, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) = ((cells[row,col] = SubString(sbuff,startpos,endpos)); false) -function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Any,2}, row::Int, col::Int, tmp64::Array{Float64,1}) - if 0 == ccall(:jl_substrtod, Int32, (Ptr{UInt8},Csize_t,Cint,Ptr{Float64}), sbuff, startpos-1, endpos-startpos+1, tmp64) - cells[row,col] = tmp64[1] - else - cells[row,col] = SubString(sbuff, startpos, endpos) +function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float64,2}, row::Int, col::Int) + n = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1) + isnull(n) || (cells[row,col] = get(n)) + isnull(n) +end +function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float32,2}, row::Int, col::Int) + n = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1) + isnull(n) || (cells[row,col] = get(n)) + isnull(n) +end +colval{T<:AbstractString, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int) = ((cells[row,col] = SubString(sbuff,startpos,endpos)); false) +function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Any,2}, row::Int, col::Int) + # if array is of Any type, attempt parsing only the most common types: Int, Bool, Float64 and fallback to SubString + len = endpos-startpos+1 + if len > 0 + # check Inteter + ni64 = tryparse_internal(Int, sbuff, startpos, endpos, 0, false) + isnull(ni64) || (cells[row,col] = get(ni64); return false) + + # check Bool + nb = tryparse_internal(Bool, sbuff, startpos, endpos, false) + isnull(nb) || (cells[row,col] = get(nb); return false) + + # check float64 + nf64 = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1) + isnull(nf64) || (cells[row,col] = get(nf64); return false) end + cells[row,col] = SubString(sbuff, startpos, endpos) false end -colval{T<:Char, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) = ((startpos==endpos) ? ((cells[row,col] = next(sbuff,startpos)[1]); false) : true) -colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array, row::Int, col::Int, tmp64::Array{Float64,1}) = true +colval{T<:Char, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int) = ((startpos==endpos) ? ((cells[row,col] = next(sbuff,startpos)[1]); false) : true) +colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array, row::Int, col::Int) = true dlm_parse(s::ASCIIString, eol::Char, dlm::Char, qchar::Char, cchar::Char, ign_adj_dlm::Bool, allow_quote::Bool, allow_comments::Bool, skipstart::Int, skipblanks::Bool, dh::DLMHandler) = begin dlm_parse(s.data, UInt32(eol)%UInt8, UInt32(dlm)%UInt8, UInt32(qchar)%UInt8, UInt32(cchar)%UInt8, diff --git a/base/gmp.jl b/base/gmp.jl index 605ab5dd1539b..f401260499349 100644 --- a/base/gmp.jl +++ b/base/gmp.jl @@ -76,10 +76,9 @@ signed(x::BigInt) = x BigInt(x::BigInt) = x BigInt(s::AbstractString) = parse(BigInt,s) -function tryparse_internal(::Type{BigInt}, s::AbstractString, base::Int, raise::Bool) +function tryparse_internal(::Type{BigInt}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool) _n = Nullable{BigInt}() - s = bytestring(s) - sgn, base, i = Base.parseint_preamble(true,s,base) + sgn, base, i = Base.parseint_preamble(true,base,s,startpos,endpos) if i == 0 raise && throw(ArgumentError("premature end of integer: $(repr(s))")) return _n @@ -87,7 +86,7 @@ function tryparse_internal(::Type{BigInt}, s::AbstractString, base::Int, raise:: z = BigInt() err = ccall((:__gmpz_set_str, :libgmp), Int32, (Ptr{BigInt}, Ptr{UInt8}, Int32), - &z, SubString(s,i), base) + &z, SubString(s,i,endpos), base) if err != 0 raise && throw(ArgumentError("invalid BigInt: $(repr(s))")) return _n diff --git a/base/string.jl b/base/string.jl index d85f6a8cd783e..d7c664d21aa1f 100644 --- a/base/string.jl +++ b/base/string.jl @@ -1485,18 +1485,18 @@ function parse{T<:Integer}(::Type{T}, c::Char, base::Integer=36) convert(T, d) end -function parseint_next(s::AbstractString, i::Int=start(s)) - done(s,i) && (return Char(0), 0, 0) - j = i - c, i = next(s,i) - c, i, j +function parseint_next(s::AbstractString, startpos::Int, endpos::Int) + (0 < startpos <= endpos) || (return Char(0), 0, 0) + j = startpos + c, startpos = next(s,startpos) + c, startpos, j end -function parseint_preamble(signed::Bool, s::AbstractString, base::Int) - c, i, j = parseint_next(s) +function parseint_preamble(signed::Bool, base::Int, s::AbstractString, startpos::Int, endpos::Int) + c, i, j = parseint_next(s, startpos, endpos) while isspace(c) - c, i, j = parseint_next(s,i) + c, i, j = parseint_next(s,i,endpos) end (j == 0) && (return 0, 0, 0) @@ -1504,12 +1504,12 @@ function parseint_preamble(signed::Bool, s::AbstractString, base::Int) if signed if c == '-' || c == '+' (c == '-') && (sgn = -1) - c, i, j = parseint_next(s,i) + c, i, j = parseint_next(s,i,endpos) end end while isspace(c) - c, i, j = parseint_next(s,i) + c, i, j = parseint_next(s,i,endpos) end (j == 0) && (return 0, 0, 0) @@ -1518,7 +1518,7 @@ function parseint_preamble(signed::Bool, s::AbstractString, base::Int) c, i = next(s,i) base = c=='b' ? 2 : c=='o' ? 8 : c=='x' ? 16 : 10 if base != 10 - c, i, j = parseint_next(s,i) + c, i, j = parseint_next(s,i,endpos) end else base = 10 @@ -1527,21 +1527,30 @@ function parseint_preamble(signed::Bool, s::AbstractString, base::Int) return sgn, base, j end +function tryparse_internal{S<:ByteString}(::Type{Bool}, sbuff::S, startpos::Int, endpos::Int, raise::Bool) + len = endpos-startpos+1 + p = pointer(sbuff)+startpos-1 + (len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "true", 4)) && (return Nullable(true)) + (len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "false", 5)) && (return Nullable(false)) + raise && throw(ArgumentError("invalid Bool representation: $(repr(SubString(s,startpos,endpos)))")) + Nullable{Bool}() +end + safe_add{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? (n1 > (typemax(T) - n2)) : (n1 < (typemin(T) - n2))) ? Nullable{T}() : Nullable{T}(n1 + n2) safe_mul{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? ((n1 > div(typemax(T),n2)) || (n1 < div(typemin(T),n2))) : (n2 < -1) ? ((n1 > div(typemin(T),n2)) || (n1 < div(typemax(T),n2))) : ((n2 == -1) && n1 == typemin(T))) ? Nullable{T}() : Nullable{T}(n1 * n2) -function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, a::Int, raise::Bool) +function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, a::Int, raise::Bool) _n = Nullable{T}() - sgn, base, i = parseint_preamble(T<:Signed,s,base) + sgn, base, i = parseint_preamble(T<:Signed, base, s, startpos, endpos) if i == 0 - raise && throw(ArgumentError("premature end of integer: $(repr(s))")) + raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))")) return _n end - c, i = parseint_next(s,i) + c, i = parseint_next(s,i,endpos) if i == 0 - raise && throw(ArgumentError("premature end of integer: $(repr(s))")) + raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))")) return _n end @@ -1553,12 +1562,12 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, 'A' <= c <= 'Z' ? c-'A'+10 : 'a' <= c <= 'z' ? c-'a'+a : base if d >= base - raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(s))")) + raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))")) return _n end n *= base n += d - if done(s,i) + if i > endpos n *= sgn return Nullable{T}(n) end @@ -1571,7 +1580,7 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, 'A' <= c <= 'Z' ? c-'A'+10 : 'a' <= c <= 'z' ? c-'a'+a : base if d >= base - raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(s))")) + raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))")) return _n end (T <: Signed) && (d *= sgn) @@ -1583,20 +1592,22 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, return _n end n = get(safe_n) - done(s,i) && return Nullable{T}(n) + (i > endpos) && return Nullable{T}(n) c, i = next(s,i) end - while !done(s,i) + while i <= endpos c, i = next(s,i) if !isspace(c) - raise && throw(ArgumentError("extra characters after whitespace in $(repr(s))")) + raise && throw(ArgumentError("extra characters after whitespace in $(repr(SubString(s,startpos,endpos)))")) return _n end end return Nullable{T}(n) end tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, raise::Bool) = - tryparse_internal(T, s, base, base <= 36 ? 10 : 36, raise) + tryparse_internal(T,s,start(s),length(s),base,raise) +tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool) = + tryparse_internal(T, s, startpos, endpos, base, base <= 36 ? 10 : 36, raise) tryparse{T<:Integer}(::Type{T}, s::AbstractString, base::Int) = 2 <= base <= 62 ? tryparse_internal(T,s,Int(base),false) : throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base")) tryparse{T<:Integer}(::Type{T}, s::AbstractString) = tryparse_internal(T,s,0,false) diff --git a/test/readdlm.jl b/test/readdlm.jl index beb25960a82b6..cf5247fcdb5e0 100644 --- a/test/readdlm.jl +++ b/test/readdlm.jl @@ -197,3 +197,8 @@ let i18n_data = ["Origin (English)", "Name (English)", "Origin (Native)", "Name writedlm(i18n_buff, i18n_arr, '\t') @test (data, hdr) == readdlm(i18n_buff, '\t', header=true) end + +@test isequaldlm(readcsv(IOBuffer("1,22222222222222222222222222222222222222,0x3,10e6\n2000.1,true,false,-10.34"), Any), + reshape(Any[1,2000.1,Float64(22222222222222222222222222222222222222),true,0x3,false,10e6,-10.34], 2, 4), Any) + +@test isequaldlm(readcsv(IOBuffer("-9223355253176920979,9223355253176920979"), Int64), Int64[-9223355253176920979 9223355253176920979], Int64)