Skip to content

Commit f045e71

Browse files
committed
Add UTF8 optimized string parsing
1 parent 39811f2 commit f045e71

File tree

1 file changed

+74
-0
lines changed

1 file changed

+74
-0
lines changed

src/utf8optimizations.jl

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,80 @@ function tryparsenext(q::Quoted{T,S,<:UInt8,<:UInt8}, str::Union{VectorBackedUTF
319319
return Nullable{T}(), i
320320
end
321321

322+
"""
    isnewline(b::UInt8)

Return `true` when the byte `b` is a line feed (10) or a carriage return (13).
"""
@inline isnewline(b::UInt8) = b == 0x0a || b == 0x0d
325+
326+
"""
    tryparsenext(s::StringToken{T}, str, i, len, opts)

Scan `str` one code unit at a time, starting at index `i`, until the string
field ends. Return a tuple `(Nullable{T}(value), next)` where `value` is built
by `_substring` from the code units `i:next-1`, and `next` is the index at
which scanning stopped.

The scan stops at the first of:

- a space or tab, when `opts.spacedelim` is set;
- an unescaped `opts.endchar`, when `opts.spacedelim` is not set;
- a line feed or carriage return, unless `opts.includenewlines` is set;
- the end of `str`.

When `opts.endchar == opts.quotechar` the field is treated as the inside of a
quoted string: an end character may then be escaped, either by doubling it
(when `opts.quotechar == opts.escapechar`) or by a preceding
`opts.escapechar`. The number of escaped positions seen is handed to
`_substring`.

With `opts.includequotes` set, a leading quote character and the terminating
end character are kept inside the returned range.

The `len` argument is not used: the upper bound is always `ncodeunits(str)`.
"""
function tryparsenext(s::StringToken{T}, str::Union{VectorBackedUTF8String, String}, i, len, opts::LocalOpts{<:UInt8,<:UInt8,<:UInt8}) where {T}
    # The scan is byte-wise, so the limit is the code unit count rather than
    # whatever the caller passed in.
    len = ncodeunits(str)

    start = i
    in_quotes = opts.endchar == opts.quotechar
    doubled_escape = opts.quotechar == opts.escapechar
    nescaped = 0
    prev = 0x00

    # Step over an opening quote so the main loop does not mistake it for the
    # terminator; `start` is left untouched, so the quote stays in the result.
    if opts.includequotes && i <= len
        @inbounds leading = codeunit(str, i)
        if leading == opts.quotechar
            i += 1
        end
    end

    while i <= len
        @inbounds cur = codeunit(str, i)
        nxt = i + 1

        # Count every position that directly follows an escape character.
        if in_quotes && prev == opts.escapechar
            nescaped += 1
        end

        if opts.spacedelim && (cur == 0x20 || cur == 0x09) # ' ' or tab
            break
        elseif !opts.spacedelim && cur == opts.endchar
            if in_quotes
                if doubled_escape && nxt <= len
                    # The quote character doubles as the escape character, so
                    # peek ahead: a second quote means this one escapes it.
                    @inbounds following = codeunit(str, nxt)
                    if following == opts.quotechar
                        i = nxt + 1 # consume both quote characters
                        prev = following
                        continue
                    end
                elseif !doubled_escape && prev == opts.escapechar
                    # The previous code unit escaped this end character.
                    i = nxt
                    prev = cur
                    continue
                end
            end
            # Genuine terminator: optionally keep it inside the returned range.
            if opts.includequotes
                i = nxt
            end
            break
        elseif !opts.includenewlines && isnewline(cur)
            break
        end

        i = nxt
        prev = cur
    end

    return Nullable{T}(_substring(T, str, start, i - 1, nescaped, opts)), i
end
395+
322396
@inline function _substring(::Type{String}, str::Union{VectorBackedUTF8String, String}, i, j, escapecount, opts::LocalOpts{<:UInt8,<:UInt8,<:UInt8})
323397
if escapecount > 0
324398
buffer = Vector{UInt8}(undef, j-i+1-escapecount)

0 commit comments

Comments
 (0)