Skip to content

Commit b59e383

Browse files
authored
Merge pull request #108 from davidanthoff/escaped-strings
Fix escaped strings handling
2 parents d1e69b9 + b87ae23 commit b59e383

File tree

7 files changed

+200
-55
lines changed

7 files changed

+200
-55
lines changed

src/csv.jl

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,13 @@ end
4444

4545
optionsiter(opts::AbstractVector, header) = optionsiter(opts)
4646

47-
# `tofield` normalises a column specification (field, token, type or date format)
# into a field object. `stringarraytype` is threaded through every method and
# decides which token a plain `String` column request resolves to.

# Something that already is a field is handed back untouched.
tofield(f::AbstractField, opts, stringarraytype) = f

# A bare token only needs to be wrapped.
tofield(f::AbstractToken, opts, stringarraytype) = Field(f)

# String tokens are wrapped in `Quoted` first, using the quote and escape
# characters configured in `opts`.
function tofield(f::StringToken, opts, stringarraytype)
    quoted = Quoted(f, opts.quotechar, opts.escapechar)
    return Field(quoted)
end

# A type is first turned into its token via `fromtype`, then dispatched again.
function tofield(f::Type, opts, stringarraytype)
    token = fromtype(f)
    return tofield(token, opts, stringarraytype)
end

# `String` columns stored in a `StringArray` are parsed through `StrRange` tokens.
function tofield(f::Type{String}, opts, stringarraytype::Type{StringArray})
    token = fromtype(StrRange)
    return tofield(token, opts, stringarraytype)
end

# `String` columns stored in a plain `Array` are parsed through `String` tokens.
function tofield(f::Type{String}, opts, stringarraytype::Type{Array})
    token = fromtype(String)
    return tofield(token, opts, stringarraytype)
end

# A `DateFormat` stands for a `DateTime` column parsed with that format.
function tofield(f::DateFormat, opts, stringarraytype)
    token = DateTimeToken(DateTime, f)
    return tofield(token, opts, stringarraytype)
end
5354

5455
"""
5556
csvread(file::Union{String,IO}, delim=','; <arguments>...)
@@ -241,9 +242,8 @@ function _csvread_internal(str::AbstractString, delim=',';
241242

242243
# seed guesses using those from previous file
243244
guess, pos1 = guesscolparsers(str, canonnames, opts,
244-
pos, type_detect_rows,
245-
colparsers, commentchar,
246-
nastrings, prev_parsers)
245+
pos, type_detect_rows, colparsers, stringarraytype,
246+
commentchar, nastrings, prev_parsers)
247247
if isempty(canonnames)
248248
canonnames = Any[1:length(guess);]
249249
end
@@ -255,7 +255,7 @@ function _csvread_internal(str::AbstractString, delim=',';
255255
if !(fieldtype(v) <: StringLike) && prev_parsers !== nothing && !haskey(colspool, c)
256256
v = isa(v, NAToken) ? v : NAToken(v)
257257
end
258-
p = tofield(v, opts)
258+
p = tofield(v, opts, stringarraytype)
259259
guess[i] = p
260260
end
261261

@@ -276,7 +276,7 @@ function _csvread_internal(str::AbstractString, delim=',';
276276
current_record[] = rec
277277

278278
if nrows == 0
279-
# just an estimate, with some margin
279+
# just an estimate, with some margin
280280
nrows = ceil(Int, (len-pos) / ((pos1-pos)/max(1, type_detect_rows)) * sqrt(2))
281281
end
282282

@@ -417,7 +417,7 @@ function _csvread_internal(str::AbstractString, delim=',';
417417
end
418418

419419
function promote_field(failed_str, field, col, err, nastrings, stringtype, stringarraytype, opts)
420-
newtoken = guesstoken(failed_str, opts, field.inner, nastrings)
420+
newtoken = guesstoken(failed_str, opts, field.inner, nastrings, stringarraytype)
421421
if newtoken == field.inner
422422
# no need to change
423423
return field, col
@@ -478,8 +478,7 @@ function readcolnames(str, opts, pos, colnames)
478478
end
479479

480480
function guesscolparsers(str::AbstractString, header, opts::LocalOpts, pos::Int,
481-
nrows::Int, colparsers, commentchar=nothing, nastrings=NA_STRINGS,
482-
prevs=nothing)
481+
nrows::Int, colparsers, stringarraytype, commentchar=nothing, nastrings=NA_STRINGS, prevs=nothing)
483482
# Field type guesses
484483
guess = []
485484
prevfields = String[]
@@ -517,7 +516,7 @@ function guesscolparsers(str::AbstractString, header, opts::LocalOpts, pos::Int,
517516
error("previous rows had $(length(guess)) fields but row $i2 has $(length(fields))")
518517
end
519518
try
520-
guess[j] = guesstoken(fields[j], opts, guess[j], nastrings)
519+
guess[j] = guesstoken(fields[j], opts, guess[j], nastrings, stringarraytype)
521520
catch err
522521
println(stderr, "Error while guessing a common type for column $j")
523522
println(stderr, "new value: $(fields[j]), prev guess was: $(guess[j])")
@@ -534,7 +533,7 @@ function guesscolparsers(str::AbstractString, header, opts::LocalOpts, pos::Int,
534533

535534
# override guesses with user request
536535
for (i, v) in optionsiter(colparsers, header)
537-
guess[i] = tofield(v, opts)
536+
guess[i] = tofield(v, opts, stringarraytype)
538537
end
539538
guess, pos
540539
end
@@ -662,7 +661,7 @@ end
662661
function quotedsplit(str, opts, includequotes, i=firstindex(str), l=lastindex(str))
663662
strtok = Quoted(StringToken(String), opts.quotechar, opts.escapechar, required=false,
664663
includequotes=includequotes)
665-
664+
666665
f = Field(strtok, eoldelim=true)
667666
strs = String[]
668667
if l == 0

src/field.jl

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,10 @@ const pre_comp_exp_double = Double64[Double64(10.0)^i for i=0:308]
146146
f = Float64(f1)
147147
r = f1 - Int64(f) # get the remainder
148148
x = Double64(f) + Double64(r)
149-
149+
150150
maxexp = 308
151151
minexp = -256
152-
152+
153153
if exp >= 0
154154
x *= pre_comp_exp_double[exp+1]
155155
else
@@ -211,7 +211,7 @@ end
211211
y3 = iterate(str, i)
212212
if y3!==nothing && _is_e(str, i)
213213
i = y3[2]
214-
214+
215215
y4 = iterate(str, i)
216216
if y4!==nothing
217217
enegate = false
@@ -266,7 +266,7 @@ function tryparsenext(::Percentage, str, i, len, opts)
266266
# parse away the % char
267267
ii = eatwhitespaces(str, ii, len)
268268
y = iterate(str, ii)
269-
if y===nothing
269+
if y===nothing
270270
return Nullable{Float64}(), ii # failed to parse %
271271
else
272272
c = y[1]; k = y[2]
@@ -295,6 +295,8 @@ show(io::IO, c::StringToken) = print(io, "<string>")
295295
fromtype(::Type{S}) where {S<:AbstractString} = StringToken(S)
296296

297297
function tryparsenext(s::StringToken{T}, str, i, len, opts) where {T}
298+
inside_quoted_strong = Char(opts.endchar) == Char(opts.quotechar)
299+
escapecount = 0
298300
R = Nullable{T}
299301
p = ' '
300302
i0 = i
@@ -312,10 +314,15 @@ function tryparsenext(s::StringToken{T}, str, i, len, opts) where {T}
312314
y2 = iterate(str, i)
313315
while y2!==nothing
314316
c = y2[1]; ii = y2[2]
317+
318+
if inside_quoted_strong && p==Char(opts.escapechar)
319+
escapecount += 1
320+
end
321+
315322
if opts.spacedelim && (c == ' ' || c == '\t')
316323
break
317324
elseif !opts.spacedelim && c == Char(opts.endchar)
318-
if Char(opts.endchar) == Char(opts.quotechar)
325+
if inside_quoted_strong
319326
# this means we're inside a quoted string
320327
if Char(opts.quotechar) == Char(opts.escapechar)
321328
# sometimes the quotechar is the escapechar
@@ -358,14 +365,41 @@ function tryparsenext(s::StringToken{T}, str, i, len, opts) where {T}
358365
y2 = iterate(str, i)
359366
end
360367

361-
return R(_substring(T, str, i0, i-1)), i
368+
return R(_substring(T, str, i0, i-1, escapecount, opts)), i
362369
end
363370

364-
# Materialise the field spanning indices `i` through `j` of `str` as a `String`.
#
# With `escapecount == 0` the bytes are copied verbatim. Otherwise the field is
# rebuilt character by character: an escape character immediately followed
# (within the field) by a quote character collapses to the quote alone; every
# other character is copied unchanged. `opts` must provide `includequotes`,
# `quotechar` and `escapechar`.
@inline function _substring(::Type{String}, str, i, j, escapecount, opts)
    # Fast path: nothing to unescape.
    escapecount > 0 || return unsafe_string(pointer(str, i), j-i+1)

    quotech = Char(opts.quotechar)
    escapech = Char(opts.escapechar)
    out = IOBuffer(sizehint=j-i+1-escapecount)
    pos = i

    # When quotes are part of the field, the opening quote is copied up front
    # so that the loop below cannot interpret it as an escape.
    leading = str[pos]
    if opts.includequotes && leading == quotech
        print(out, leading)
        pos = nextind(str, pos)
    end

    while pos <= j
        ch = str[pos]
        if ch == escapech
            following = nextind(str, pos)
            if following <= j && str[following] == quotech
                # escape + quote: emit only the quote and step over the pair
                ch = str[following]
                pos = following
            end
        end
        print(out, ch)
        pos = nextind(str, pos)
    end

    return String(take!(out))
end
367400

368-
# Return the field spanning indices `i` through `j` of `str` as a `SubString`
# of type `T`, without copying.
#
# Fields that contain escape sequences (`escapecount > 0`) cannot be expressed
# as a plain view of the input and raise an error. `opts` is accepted for
# signature parity with the other `_substring` methods and is not used.
@inline function _substring(::Type{T}, str, i, j, escapecount, opts) where {T<:SubString}
    escapecount > 0 && error("Not yet handled.")
    # `j` may point at a continuation byte of the last character; `thisind`
    # needs the string to snap it back to that character's first index.
    # (The one-argument form `thisind(j)` has no method and always threw.)
    return T(str, i, thisind(str, j))
end
371405

@@ -375,11 +409,12 @@ fromtype(::Type{StrRange}) = StringToken(StrRange)
375409
unsafe_string(pointer(str, 1 + r.offset), r.length)
376410
end
377411

378-
# Describe the field `i:j` as a `StrRange` instead of copying it: the offset is
# `i - 1`, the length is `j - i + 1`, and `escapecount` is carried along
# unchanged. `str` and `opts` are not inspected.
@inline _substring(::Type{StrRange}, str, i, j, escapecount, opts) =
    StrRange(i - 1, j - i + 1, escapecount)
381415

382-
# Return the field `i:j` as a `WeakRefString` that points into `str`'s memory
# (no copy). Fields with escape sequences are not supported and raise an error.
# `opts` is not used.
@inline function _substring(::Type{<:WeakRefString}, str, i, j, escapecount, opts)
    if escapecount > 0
        error("Not yet handled.")
    end
    nbytes = j - i + 1
    first_byte = convert(Ptr{UInt8}, pointer(str, i))
    return WeakRefString(first_byte, nbytes)
end
385420

src/guesstype.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ function getquotechar(x)
5252
return '\0'
5353
end
5454

55-
function guesstoken(x, opts, @nospecialize(prev_guess)=Unknown(), nastrings=NA_STRINGS)
55+
function guesstoken(x, opts, @nospecialize(prev_guess)=Unknown(), nastrings=NA_STRINGS, stringarraytype=StringArray)
5656
q = getquotechar(x)
5757

5858
if isa(prev_guess, StringToken)
@@ -65,18 +65,18 @@ function guesstoken(x, opts, @nospecialize(prev_guess)=Unknown(), nastrings=NA_S
6565
else
6666
prev_inner = prev_guess
6767
end
68-
inner_token = guesstoken(strip(strip(x, q)), opts, prev_inner, nastrings)
68+
inner_token = guesstoken(strip(strip(x, q)), opts, prev_inner, nastrings, stringarraytype)
6969
return Quoted(inner_token, opts.quotechar, opts.escapechar)
7070
elseif isa(prev_guess, Quoted)
7171
# but this token is not quoted
72-
return Quoted(guesstoken(x, opts, prev_guess.inner, nastrings), opts.quotechar, opts.escapechar)
72+
return Quoted(guesstoken(x, opts, prev_guess.inner, nastrings, stringarraytype), opts.quotechar, opts.escapechar)
7373
elseif isa(prev_guess, NAToken)
7474
# This column is nullable
7575
if isna(x, nastrings)
7676
# x is null too, return previous guess
7777
return prev_guess
7878
else
79-
tok = guesstoken(x, opts, prev_guess.inner, nastrings)
79+
tok = guesstoken(x, opts, prev_guess.inner, nastrings, stringarraytype)
8080
if isa(tok, StringToken)
8181
return tok # never wrap a string in NAToken
8282
elseif isa(tok, Quoted)
@@ -108,17 +108,17 @@ function guesstoken(x, opts, @nospecialize(prev_guess)=Unknown(), nastrings=NA_S
108108
return Numeric(promote_type(T, fieldtype(prev_guess)))
109109
else
110110
# something like a date turned into a single number?
111-
return StringToken(StrRange)
111+
return StringToken(stringarraytype<:StringArray ? StrRange : String)
112112
end
113113
else
114114
# fast-path
115115
if length(filter(isnumeric, x)) < 4
116-
return StringToken(StrRange)
116+
return StringToken(stringarraytype<:StringArray ? StrRange : String)
117117
end
118118

119119
maybedate = guessdateformat(x)
120120
if maybedate === nothing
121-
return StringToken(StrRange)
121+
return StringToken(stringarraytype<:StringArray ? StrRange : String)
122122
else
123123
return maybedate
124124
end

src/record.jl

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,8 @@ end
9595
PARSE_SUCCESS
9696
end
9797

98-
@inline function setcell!(col::Array{String,1}, i, val::StrRange, str)
99-
col[i] = alloc_string(str, val)
100-
PARSE_SUCCESS
101-
end
102-
10398
# Store the string described by `val` (an offset/length range into `str`) in
# row `i` of `col` as a `WeakRefString` pointing into `str`; reports
# `PARSE_SUCCESS`.
@inline Base.@propagate_inbounds function setcell!(col::StringVector, i, val::StrRange, str)
    # TODO Properly handle the val.escapecount>0 case
    # `val.offset` is zero-based, `pointer` takes a one-based index.
    first_byte = pointer(str, val.offset + 1)
    col[i] = WeakRefString(first_byte, val.length)
    return PARSE_SUCCESS
end

src/utf8optimizations.jl

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,4 +317,116 @@ function tryparsenext(q::Quoted{T,S,<:UInt8,<:UInt8}, str::Union{VectorBackedUTF
317317

318318
@label error
319319
return Nullable{T}(), i
320-
end
320+
end
321+
322+
# True when the byte is an ASCII line feed (0x0a) or carriage return (0x0d).
@inline isnewline(b::UInt8) = b == 0x0a || b == 0x0d
325+
326+
# Byte-level `StringToken` parser for UTF-8 backed strings whose option
# characters are all `UInt8`.
#
# Scans code units starting at `i` until the field ends and returns
# `(Nullable{T}(field), next_index)`. The field ends at
#   * a space or tab when `opts.spacedelim` is set,
#   * `opts.endchar` otherwise (unless it is recognised as escaped, see below),
#   * a CR/LF byte unless `opts.includenewlines` is set,
#   * or the end of `str`.
# When `opts.endchar == opts.quotechar` the scan is treated as running inside a
# quoted string, and escaped quotes do not end the field.
function tryparsenext(s::StringToken{T}, str::Union{VectorBackedUTF8String, String}, i, len, opts::LocalOpts{<:UInt8,<:UInt8,<:UInt8}) where {T}
    # The incoming `len` is discarded; the scan is bounded by the code-unit
    # length of `str`.
    len = ncodeunits(str)
    in_quotes = opts.endchar == opts.quotechar
    nescapes = 0
    prev = 0x00
    start = i

    # Step over an opening quote so the loop below cannot mistake it for the
    # closing one.
    if opts.includequotes && i <= len
        @inbounds opening = codeunit(str, i)
        if opening == opts.quotechar
            i += 1
        end
    end

    while i <= len
        @inbounds cur = codeunit(str, i)
        after = i + 1

        # Count every byte whose predecessor was the escape character
        # (quoted fields only); the total is forwarded to `_substring`.
        if in_quotes && prev == opts.escapechar
            nescapes += 1
        end

        if opts.spacedelim && (cur == 0x20 || cur == 0x09) # ' ' or '\t'
            break
        elseif !opts.spacedelim && cur == opts.endchar
            if in_quotes
                if opts.quotechar == opts.escapechar
                    # The quote doubles as the escape character, so a quote
                    # directly followed by another quote is an escaped quote:
                    # consume both bytes and keep scanning.
                    if after <= len
                        @inbounds peeked = codeunit(str, after)
                        if peeked == opts.quotechar
                            i = after + 1
                            prev = peeked
                            continue
                        end
                    end
                elseif prev == opts.escapechar
                    # The previous byte escaped this one.
                    i = after
                    prev = cur
                    continue
                end
            end
            # Terminating character: step past it only when quotes are to be
            # included in the field.
            if opts.includequotes
                i = after
            end
            break
        elseif !opts.includenewlines && isnewline(cur)
            break
        end

        i = after
        prev = cur
    end

    return Nullable{T}(_substring(T, str, start, i - 1, nescapes, opts)), i
end
395+
396+
# Byte-level variant of `_substring(::Type{String}, ...)` for UTF-8 backed
# strings with `UInt8` option characters.
#
# Returns the code units `i:j` of `str` as a `String`. With `escapecount == 0`
# the bytes are copied verbatim. Otherwise an escape byte immediately followed
# (within `i:j`) by a quote byte collapses to the quote alone, and every other
# byte -- including an escape byte that precedes a non-quote byte -- is copied
# unchanged, matching the generic `_substring(::Type{String}, ...)` method.
#
# `escapecount` is only used to select the slow path. It is deliberately NOT
# used to size the output: the count produced by the scanner can differ from
# the number of bytes actually removed here, and the stores below are
# `@inbounds`.
@inline function _substring(::Type{String}, str::Union{VectorBackedUTF8String, String}, i, j, escapecount, opts::LocalOpts{<:UInt8,<:UInt8,<:UInt8})
    if escapecount > 0
        # Unescaping can only shrink the field, so `j-i+1` is a safe upper
        # bound; the buffer is trimmed to the bytes actually written below.
        buffer = Vector{UInt8}(undef, j-i+1)
        cur_i = i
        nwritten = 0
        @inbounds c = codeunit(str, cur_i)
        if opts.includequotes && c==opts.quotechar
            # Copy the opening quote up front so it is never treated as an
            # escape byte by the loop.
            nwritten += 1
            @inbounds buffer[nwritten] = c
            cur_i += 1
        end
        while cur_i <= j
            @inbounds c = codeunit(str, cur_i)
            if c == opts.escapechar && cur_i < j
                @inbounds next_c = codeunit(str, cur_i + 1)
                if next_c == opts.quotechar
                    # escape + quote: keep only the quote, skip the pair
                    c = next_c
                    cur_i += 1
                end
            end
            # Exactly one byte is written per iteration and at least one is
            # consumed, so `nwritten` never exceeds `j-i+1`.
            nwritten += 1
            @inbounds buffer[nwritten] = c
            cur_i += 1
        end
        resize!(buffer, nwritten)
        return String(buffer)
    else
        return unsafe_string(pointer(str, i), j-i+1)
    end
end

0 commit comments

Comments
 (0)