Merge branch 'master' into escaped-strings

davidanthoff · davidanthoff · commit b5f79751bdca · 2019-03-17T15:00:08.000-07:00
diff --git a/src/VectorBackedStrings.jl b/src/VectorBackedStrings.jl
@@ -35,10 +35,10 @@ Base.@propagate_inbounds function Base.iterate(s::VectorBackedUTF8String, i::Int
     b = codeunit(s, i)
     u = UInt32(b) << 24
     Base.between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
-    return Base.next_continued(s, i, u)
+    return our_next_continued(s, i, u)
 end
 
-function Base.next_continued(s::VectorBackedUTF8String, i::Int, u::UInt32)
+function our_next_continued(s::VectorBackedUTF8String, i::Int, u::UInt32)
     u < 0xc0000000 && (i += 1; @goto ret)
     n = ncodeunits(s)
     # first continuation byte
diff --git a/src/csv.jl b/src/csv.jl
@@ -66,6 +66,7 @@ Read CSV from `file`. Returns a tuple of 2 elements:
 - `spacedelim`: (Bool) parse space-delimited files. `delim` has no effect if true.
 - `quotechar`: character used to quote strings, defaults to `"`
 - `escapechar`: character used to escape quotechar in strings. (could be the same as quotechar)
+- `commentchar`: character that marks a comment line; lines beginning with it are skipped (`nothing` disables comment skipping)
 - `nrows`: number of rows in the file. Defaults to `0` in which case we try to estimate this.
 - `skiplines_begin`: skips specified number of lines at the beginning of the file
 - `header_exists`: boolean specifying whether CSV file contains a header
@@ -157,6 +158,7 @@ function _csvread_internal(str::AbstractString, delim=',';
                  spacedelim=false,
                  quotechar='"',
                  escapechar='"',
+                 commentchar=nothing,
                  stringtype=String,
                  stringarraytype=StringArray,
                  noresize=false,
@@ -206,6 +208,11 @@ function _csvread_internal(str::AbstractString, delim=',';
         pos, lines = eatnewlines(str, pos)
         lineno += lines
     end
+
+    # Ignore commented lines before the header.
+    pos, lines = eatcommentlines(str, pos, len, commentchar)
+    lineno += lines
+
     if header_exists
         merged_colnames, pos = readcolnames(str, opts, pos, colnames)
         lineno += 1
@@ -235,8 +242,8 @@ function _csvread_internal(str::AbstractString, delim=',';
 
     # seed guesses using those from previous file
     guess, pos1 = guesscolparsers(str, canonnames, opts,
-                                  pos, type_detect_rows, colparsers,
-                                  stringarraytype, nastrings, prev_parsers)
+                                  pos, type_detect_rows, colparsers, stringarraytype, 
+                                  commentchar, nastrings, prev_parsers)
     if isempty(canonnames)
         canonnames = Any[1:length(guess);]
     end
@@ -319,7 +326,7 @@ function _csvread_internal(str::AbstractString, delim=',';
     @label retry
     try
         finalrows = parsefill!(str, opts, rec, nrows, cols, colspool,
-                               pos, lineno, rowno, lastindex(str))
+                               pos, lineno, rowno, lastindex(str), commentchar)
         if !noresize
             resizecols(colspool, finalrows)
         end
@@ -470,19 +477,19 @@ function readcolnames(str, opts, pos, colnames)
     colnames_inferred, lineend+1
 end
 
-
 function guesscolparsers(str::AbstractString, header, opts::LocalOpts, pos::Int,
-                       nrows::Int, colparsers, stringarraytype, nastrings=NA_STRINGS, prevs=nothing)
+                       nrows::Int, colparsers, stringarraytype, commentchar=nothing, nastrings=NA_STRINGS, prevs=nothing)
     # Field type guesses
     guess = []
     prevfields = String[]
 
     givenkeys = !isempty(colparsers) ? first.(collect(optionsiter(colparsers, header))) : []
     for i2=1:nrows
         pos, _ = eatnewlines(str, pos)
-        if pos > lastindex(str)
-            break
-        end
+
+        # Move past commented lines before guessing.
+        pos, _ = eatcommentlines(str, pos, lastindex(str), commentchar)
+        pos > lastindex(str) && break
 
         lineend = getrowend(str, pos, lastindex(str), opts, opts.endchar)
 
@@ -532,12 +539,19 @@ function guesscolparsers(str::AbstractString, header, opts::LocalOpts, pos::Int,
 end
 
 function parsefill!(str::AbstractString, opts, rec::RecN{N}, nrecs, cols, colspool,
-                    pos, lineno, rowno, l=lastindex(str)) where {N}
+                    pos, lineno, rowno, l=lastindex(str), commentchar=nothing) where {N}
     pos, lines = eatnewlines(str, pos, l)
     lineno += lines
+
     pos <= l && while true
         prev_j = pos
         lineno += lines
+
+        # Do not try to parse commented lines.
+        pos, lines = eatcommentlines(str, pos, l, commentchar)
+        lineno += lines
+        pos > l && return rowno-1
+
         res = tryparsesetindex(rec, str, pos, l, cols, rowno, opts)
         if !issuccess(res)
             pos, fieldpos, colno, err_code = geterror(res)
@@ -553,6 +567,7 @@ function parsefill!(str::AbstractString, opts, rec::RecN{N}, nrecs, cols, colspo
         if pos > l
             return rowno
         end
+
         rowno += 1
         lineno += 1
         if rowno > nrecs
diff --git a/src/util.jl b/src/util.jl
@@ -93,7 +93,7 @@ end
 end
 
 Base.@pure maxdigits(::Type{T}) where {T} = ndigits(typemax(T))
-Base.@pure min_with_max_digits(::Type{T}) where {T} = convert(T, 10^(ndigits(typemax(T))-1))
+Base.@pure min_with_max_digits(::Type{T}) where {T} = convert(T, T(10)^(maxdigits(T)-1)) # smallest value of T with maxdigits(T) decimal digits; the power is computed in T, not Int
 
 @inline function tryparsenext_base10(T, str,i,len)
     i0 = i
@@ -108,7 +108,7 @@ Base.@pure min_with_max_digits(::Type{T}) where {T} = convert(T, 10^(ndigits(typ
         y2 === nothing && return R(convert(T, 0)), i
         r = y2[1]; i = y2[2]
     end
-   
+
     digits = 1
     ten = T(10)
     while true
@@ -214,6 +214,23 @@ function eatnewlines(str, i=1, l=lastindex(str))
     return i, count
 end
 
+# Skip consecutive lines that start with `commentchar`, scanning no further than index `l`.
+# Returns `(pos, count)`: the new position in `str` and the summed line counts from `eatnewlines`.
+function eatcommentlines(str, i=1, l=lastindex(str), commentchar::Union{Char, Nothing}=nothing)
+    commentchar === nothing && return i, 0  # comment skipping is disabled
+
+    count = 0
+    while i <= l && str[i] == commentchar
+        i = getlineend(str, i)
+        y = iterate(str, i)
+        y === nothing && return i, count  # nothing left to iterate after the comment line
+        i = y[2]
+        i, lines = eatnewlines(str, i, l)  # pass the caller's bound `l` instead of the default
+        count += lines
+    end
+    return i, count
+end
+
 function stripquotes(x)
     x[1] in ('\'', '"') && x[1] == x[end] ?
         strip(x, x[1]) : x
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -111,7 +111,12 @@ end
     @test tryparsenext(fromtype(Int64), "9223372036854775807", 1, 19) |> unwrap == (9223372036854775807, 20)
     @test tryparsenext(fromtype(Int64), "9223372036854775808", 1, 19) |> failedat == 1
     @test tryparsenext(fromtype(Int64), "19223372036854775808", 1, 20) |> failedat == 1
-
+    @test tryparsenext(fromtype(UInt64), "18446744073709551615", 1, 20) |> unwrap == (0xffffffffffffffff, 21)
+    @test tryparsenext(fromtype(UInt64), "18446744073709551616", 1, 20) |> failedat == 1
+    @test tryparsenext(fromtype(Int128), "170141183460469231731687303715884105727", 1, 39) |> unwrap == (170141183460469231731687303715884105727, 40)
+    @test tryparsenext(fromtype(Int128), "170141183460469231731687303715884105728", 1, 39) |> failedat == 1
+    @test tryparsenext(fromtype(UInt128), "340282366920938463463374607431768211455", 1, 39) |> unwrap == (0xffffffffffffffffffffffffffffffff, 40)
+    @test tryparsenext(fromtype(UInt128), "340282366920938463463374607431768211456", 1, 39) |> failedat == 1
 end
 
 import TextParse: StringToken
@@ -554,6 +559,86 @@ import TextParse: _csvread
                    a""b"", 1""", stringarraytype=Array) == ((["a\"\"b\"\""], [1]), ["x\"\"y\"\"", "z"])
 end
 
+import TextParse: _csvread
+@testset "commentchar" begin
+
+    # The first data row (directly after the header) is commented out.
+    str1 = """
+    x,y,z
+    #1,1,1
+    2,2,2
+    """
+
+    @test _csvread(str1, commentchar='#') == (([2], [2], [2]), String["x", "y","z"])
+
+    # The last row of the input is commented out.
+    str2 = """
+    x,y,z
+    1,1,1
+    #2,2,2
+    """
+
+    @test _csvread(str2, commentchar='#') == (([1], [1], [1]), String["x", "y","z"])
+
+    # Several consecutive comment rows between data rows, plus a trailing comment row.
+    str3 = """
+    x,y,z
+    1,1,1
+    #2,2,2
+    #3,3,3
+    #4,4,4
+    5,5,5
+    #6,6,6
+    """
+
+    @test _csvread(str3, commentchar='#') == (([1, 2 + 3], [1, 5], [1, 5]), String["x", "y","z"])
+
+    # Comment lines placed before the header row (and one after the data).
+    str4 = """
+    #foo
+    #bar
+    x,y,z
+    1,1,1
+    #2,2,2
+    """
+
+    @test _csvread(str4, commentchar='#') == (([1], [1], [1]), String["x", "y","z"])
+
+    # commentchar is set, but the input contains no comment lines.
+    str5 = """
+    x,y,z
+    1,1,1
+    2,2,2
+    """
+
+    @test _csvread(str5, commentchar='#') == (([1, 2], [1, 2], [1, 2]), String["x", "y","z"])
+
+    # A comment character other than '#'.
+    str6 = """
+    %test
+    x,y,z
+    1,1,1
+    %2,2,2
+    2,2,2
+    """
+
+    @test _csvread(str6, commentchar='%') == (([1, 2], [1, 2], [1, 2]), String["x", "y","z"])
+
+    # commentchar is not passed, so lines starting with '#' are not skipped.
+    str7 = """
+    x,y,z
+    1,1,1
+    #2,2,2
+    """
+
+    # With comment skipping disabled, '#' is ordinary data, so the first column
+    # (which contains "#2") is expected to be parsed as a String column.
+    # The assertions below verify this behavior.
+    result = _csvread(str7)
+    @test eltype(result[1][1]) == String
+    @test result == ((["1", "#2"], [1, 2], [1, 2]), String["x", "y","z"])
+end
+
 @testset "skiplines_begin" begin
     str1 = """
     hello