Skip to content

Commit b5f7975

Browse files
committed
Merge branch 'master' into escaped-strings
2 parents f045e71 + d1e69b9 commit b5f7975

File tree

4 files changed

+131
-14
lines changed

4 files changed

+131
-14
lines changed

src/VectorBackedStrings.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ Base.@propagate_inbounds function Base.iterate(s::VectorBackedUTF8String, i::Int
3535
b = codeunit(s, i)
3636
u = UInt32(b) << 24
3737
Base.between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
38-
return Base.next_continued(s, i, u)
38+
return our_next_continued(s, i, u)
3939
end
4040

41-
function Base.next_continued(s::VectorBackedUTF8String, i::Int, u::UInt32)
41+
function our_next_continued(s::VectorBackedUTF8String, i::Int, u::UInt32)
4242
u < 0xc0000000 && (i += 1; @goto ret)
4343
n = ncodeunits(s)
4444
# first continuation byte

src/csv.jl

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ Read CSV from `file`. Returns a tuple of 2 elements:
6666
- `spacedelim`: (Bool) parse space-delimited files. `delim` has no effect if true.
6767
- `quotechar`: character used to quote strings, defaults to `"`
6868
- `escapechar`: character used to escape quotechar in strings. (could be the same as quotechar)
69+
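- `commentchar`: (Char or `nothing`) lines that begin with this character are skipped as comments. Defaults to `nothing`, in which case no lines are skipped.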
6970
- `nrows`: number of rows in the file. Defaults to `0` in which case we try to estimate this.
7071
- `skiplines_begin`: skips specified number of lines at the beginning of the file
7172
- `header_exists`: boolean specifying whether CSV file contains a header
@@ -157,6 +158,7 @@ function _csvread_internal(str::AbstractString, delim=',';
157158
spacedelim=false,
158159
quotechar='"',
159160
escapechar='"',
161+
commentchar=nothing,
160162
stringtype=String,
161163
stringarraytype=StringArray,
162164
noresize=false,
@@ -206,6 +208,11 @@ function _csvread_internal(str::AbstractString, delim=',';
206208
pos, lines = eatnewlines(str, pos)
207209
lineno += lines
208210
end
211+
212+
# Ignore commented lines before the header.
213+
pos, lines = eatcommentlines(str, pos, len, commentchar)
214+
lineno += lines
215+
209216
if header_exists
210217
merged_colnames, pos = readcolnames(str, opts, pos, colnames)
211218
lineno += 1
@@ -235,8 +242,8 @@ function _csvread_internal(str::AbstractString, delim=',';
235242

236243
# seed guesses using those from previous file
237244
guess, pos1 = guesscolparsers(str, canonnames, opts,
238-
pos, type_detect_rows, colparsers,
239-
stringarraytype, nastrings, prev_parsers)
245+
pos, type_detect_rows, colparsers, stringarraytype,
246+
commentchar, nastrings, prev_parsers)
240247
if isempty(canonnames)
241248
canonnames = Any[1:length(guess);]
242249
end
@@ -319,7 +326,7 @@ function _csvread_internal(str::AbstractString, delim=',';
319326
@label retry
320327
try
321328
finalrows = parsefill!(str, opts, rec, nrows, cols, colspool,
322-
pos, lineno, rowno, lastindex(str))
329+
pos, lineno, rowno, lastindex(str), commentchar)
323330
if !noresize
324331
resizecols(colspool, finalrows)
325332
end
@@ -470,19 +477,19 @@ function readcolnames(str, opts, pos, colnames)
470477
colnames_inferred, lineend+1
471478
end
472479

473-
474480
function guesscolparsers(str::AbstractString, header, opts::LocalOpts, pos::Int,
475-
nrows::Int, colparsers, stringarraytype, nastrings=NA_STRINGS, prevs=nothing)
481+
nrows::Int, colparsers, stringarraytype, commentchar=nothing, nastrings=NA_STRINGS, prevs=nothing)
476482
# Field type guesses
477483
guess = []
478484
prevfields = String[]
479485

480486
givenkeys = !isempty(colparsers) ? first.(collect(optionsiter(colparsers, header))) : []
481487
for i2=1:nrows
482488
pos, _ = eatnewlines(str, pos)
483-
if pos > lastindex(str)
484-
break
485-
end
489+
490+
# Move past commented lines before guessing.
491+
pos, _ = eatcommentlines(str, pos, lastindex(str), commentchar)
492+
pos > lastindex(str) && break
486493

487494
lineend = getrowend(str, pos, lastindex(str), opts, opts.endchar)
488495

@@ -532,12 +539,19 @@ function guesscolparsers(str::AbstractString, header, opts::LocalOpts, pos::Int,
532539
end
533540

534541
function parsefill!(str::AbstractString, opts, rec::RecN{N}, nrecs, cols, colspool,
535-
pos, lineno, rowno, l=lastindex(str)) where {N}
542+
pos, lineno, rowno, l=lastindex(str), commentchar=nothing) where {N}
536543
pos, lines = eatnewlines(str, pos, l)
537544
lineno += lines
545+
538546
pos <= l && while true
539547
prev_j = pos
540548
lineno += lines
549+
550+
# Do not try to parse commented lines.
551+
pos, lines = eatcommentlines(str, pos, l, commentchar)
552+
lineno += lines
553+
pos > l && return rowno-1
554+
541555
res = tryparsesetindex(rec, str, pos, l, cols, rowno, opts)
542556
if !issuccess(res)
543557
pos, fieldpos, colno, err_code = geterror(res)
@@ -553,6 +567,7 @@ function parsefill!(str::AbstractString, opts, rec::RecN{N}, nrecs, cols, colspo
553567
if pos > l
554568
return rowno
555569
end
570+
556571
rowno += 1
557572
lineno += 1
558573
if rowno > nrecs

src/util.jl

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ end
9393
end
9494

9595
Base.@pure maxdigits(::Type{T}) where {T} = ndigits(typemax(T))
96-
Base.@pure min_with_max_digits(::Type{T}) where {T} = convert(T, 10^(ndigits(typemax(T))-1))
96+
Base.@pure min_with_max_digits(::Type{T}) where {T} = convert(T, T(10)^(maxdigits(T)-1))
9797

9898
@inline function tryparsenext_base10(T, str,i,len)
9999
i0 = i
@@ -108,7 +108,7 @@ Base.@pure min_with_max_digits(::Type{T}) where {T} = convert(T, 10^(ndigits(typ
108108
y2 === nothing && return R(convert(T, 0)), i
109109
r = y2[1]; i = y2[2]
110110
end
111-
111+
112112
digits = 1
113113
ten = T(10)
114114
while true
@@ -214,6 +214,23 @@ function eatnewlines(str, i=1, l=lastindex(str))
214214
return i, count
215215
end
216216

217+
"""
    eatcommentlines(str, i=1, l=lastindex(str), commentchar=nothing)

Move past consecutive lines of `str` that start with `commentchar`, beginning at
index `i` and not examining line starts beyond index `l`.

Return a tuple `(pos, count)`: `pos` is the index at which scanning stopped (the
first position that does not start a comment line), and `count` is the sum of the
line counts reported by `eatnewlines` while skipping.

When `commentchar` is `nothing`, comment skipping is disabled and `(i, 0)` is
returned unchanged.
"""
function eatcommentlines(
    str, i=1, l=lastindex(str), commentchar::Union{Char, Nothing}=nothing
)
    commentchar === nothing && return i, 0

    count = 0
    while i <= l && str[i] == commentchar
        # NOTE(review): `getlineend` is defined elsewhere; it presumably returns an
        # index at the end of the current line -- confirm against its definition.
        i = getlineend(str, i)
        y = iterate(str, i)
        y === nothing && return i, count
        # Advance to the index following the one returned by `getlineend`.
        i = y[2]
        # Honor the caller-supplied bound `l` rather than silently falling back to
        # `lastindex(str)`, matching how `parsefill!` calls `eatnewlines`.
        i, lines = eatnewlines(str, i, l)
        count += lines
    end
    return i, count
end
233+
217234
function stripquotes(x)
218235
x[1] in ('\'', '"') && x[1] == x[end] ?
219236
strip(x, x[1]) : x

test/runtests.jl

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,12 @@ end
111111
@test tryparsenext(fromtype(Int64), "9223372036854775807", 1, 19) |> unwrap == (9223372036854775807, 20)
112112
@test tryparsenext(fromtype(Int64), "9223372036854775808", 1, 19) |> failedat == 1
113113
@test tryparsenext(fromtype(Int64), "19223372036854775808", 1, 20) |> failedat == 1
114-
114+
@test tryparsenext(fromtype(UInt64), "18446744073709551615", 1, 20) |> unwrap == (0xffffffffffffffff, 21)
115+
@test tryparsenext(fromtype(UInt64), "18446744073709551616", 1, 20) |> failedat == 1
116+
@test tryparsenext(fromtype(Int128), "170141183460469231731687303715884105727", 1, 39) |> unwrap == (170141183460469231731687303715884105727, 40)
117+
@test tryparsenext(fromtype(Int128), "170141183460469231731687303715884105728", 1, 39) |> failedat == 1
118+
@test tryparsenext(fromtype(UInt128), "340282366920938463463374607431768211455", 1, 39) |> unwrap == (0xffffffffffffffffffffffffffffffff, 40)
119+
@test tryparsenext(fromtype(UInt128), "340282366920938463463374607431768211456", 1, 39) |> failedat == 1
115120
end
116121

117122
import TextParse: StringToken
@@ -554,6 +559,86 @@ import TextParse: _csvread
554559
a""b"", 1""", stringarraytype=Array) == ((["a\"\"b\"\""], [1]), ["x\"\"y\"\"", "z"])
555560
end
556561

562+
import TextParse: _csvread
563+
@testset "commentchar" begin
564+
565+
# First line a comment.
566+
str1 = """
567+
x,y,z
568+
#1,1,1
569+
2,2,2
570+
"""
571+
572+
@test _csvread(str1, commentchar='#') == (([2], [2], [2]), String["x", "y","z"])
573+
574+
# Last line a comment.
575+
str2 = """
576+
x,y,z
577+
1,1,1
578+
#2,2,2
579+
"""
580+
581+
@test _csvread(str2, commentchar='#') == (([1], [1], [1]), String["x", "y","z"])
582+
583+
# Multiple comments.
584+
str3 = """
585+
x,y,z
586+
1,1,1
587+
#2,2,2
588+
#3,3,3
589+
#4,4,4
590+
5,5,5
591+
#6,6,6
592+
"""
593+
594+
@test _csvread(str3, commentchar='#') == (([1, 5], [1, 5], [1, 5]), String["x", "y","z"])
595+
596+
# Comments before headers.
597+
str4 = """
598+
#foo
599+
#bar
600+
x,y,z
601+
1,1,1
602+
#2,2,2
603+
"""
604+
605+
@test _csvread(str4, commentchar='#') == (([1], [1], [1]), String["x", "y","z"])
606+
607+
# No comments.
608+
str5 = """
609+
x,y,z
610+
1,1,1
611+
2,2,2
612+
"""
613+
614+
@test _csvread(str5, commentchar='#') == (([1, 2], [1, 2], [1, 2]), String["x", "y","z"])
615+
616+
# Non-default comment.
617+
str6 = """
618+
%test
619+
x,y,z
620+
1,1,1
621+
%2,2,2
622+
2,2,2
623+
"""
624+
625+
@test _csvread(str6, commentchar='%') == (([1, 2], [1, 2], [1, 2]), String["x", "y","z"])
626+
627+
# Do not skip commented lines (commentchar=nothing).
628+
str7 = """
629+
x,y,z
630+
1,1,1
631+
#2,2,2
632+
"""
633+
634+
# Since we are not skipping commented lines the '#' character is considered
635+
# data. This will force parsing to treat columns with '#'s as String columns.
636+
# Here, we verify this behavior.
637+
result = _csvread(str7)
638+
@test eltype(result[1][1]) == String
639+
@test result == ((["1", "#2"], [1, 2], [1, 2]), String["x", "y","z"])
640+
end
641+
557642
@testset "skiplines_begin" begin
558643
str1 = """
559644
hello

0 commit comments

Comments
 (0)