Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e03b0e4
fix textloader bug on quotes
LittleLittleCloud Dec 17, 2019
3ddb3b0
use static method instead of extension method
LittleLittleCloud Dec 19, 2019
9b20ead
better name
LittleLittleCloud Dec 19, 2019
34d9efa
Merge branch 'master' into u/xiaoyun/fixTextLoaderBug
LittleLittleCloud Apr 15, 2020
ac6f971
Merge remote-tracking branch 'xiaoyun/u/xiaoyun/fixTextLoaderBug' int…
antoniovs1029 May 11, 2020
8304d62
Moved multiline reader to TextLoaderCursor, and added return when mul…
antoniovs1029 May 13, 2020
a9e91e2
Fix issue with empty unquoted new lines
antoniovs1029 May 13, 2020
9cfaee1
Make ReadMultiLine a little bit more efficient
antoniovs1029 May 13, 2020
2827c61
Add clarifying comment in FetchNextField
antoniovs1029 May 13, 2020
a427662
Added test
antoniovs1029 May 13, 2020
1cafbf0
Added test for column inference
antoniovs1029 May 14, 2020
6116d97
Make multilinereader a little bit more efficient
antoniovs1029 May 14, 2020
df2ca25
Create read multilines parameter
antoniovs1029 May 14, 2020
530f41e
Make tests run with readMultilines parameter
antoniovs1029 May 14, 2020
d9af9d2
Remove public parameter for readMultilines, as it is considered a bre…
antoniovs1029 May 15, 2020
6a7b632
Update manifest
antoniovs1029 May 15, 2020
fb6ab28
Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
antoniovs1029 May 15, 2020
f592a7f
Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
antoniovs1029 May 15, 2020
a13b803
Removed new readMultiline option from AutoML.NET and removed the test…
antoniovs1029 May 15, 2020
c2d2ac7
Merge branch 'is4460TextLoaderQuoting' of https://github.com/antoniov…
antoniovs1029 May 15, 2020
13033bf
Actually add new line characters to loaded strings
antoniovs1029 May 16, 2020
2983b06
Actually include new line characters in loaded strings
antoniovs1029 May 16, 2020
9789479
Let the TextSaver also correctly save new lines inside quoted fields
antoniovs1029 May 18, 2020
60c4169
Fixed bug when calling GetSomeLines
antoniovs1029 May 18, 2020
fa28ddd
Throw exception when reaching EOF without ending quote on a given field.
antoniovs1029 May 19, 2020
56279b4
Refactored readMultilines into OptionsFlags
antoniovs1029 May 19, 2020
51d9390
Added test to check new readMultiline option from MAML.
antoniovs1029 May 19, 2020
5cc7512
Merge remote-tracking branch 'upstream/master' into is4460TextLoaderQ…
antoniovs1029 May 19, 2020
5be5f6f
Fixed mistake on an unrelated test
antoniovs1029 May 19, 2020
b4e3029
Added internal default to ReadMultilines
antoniovs1029 May 19, 2020
7e16fc7
Do more checking in MultilineReader in order to be more flexible in a…
antoniovs1029 May 19, 2020
f0652a5
Updated tests
antoniovs1029 May 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,13 @@ public class Options
[Argument(ArgumentType.AtMostOnce, HelpText = "Use separate parsing threads?", ShortName = "threads", Hide = true)]
public bool UseThreads = true;

/// <summary>
/// If true, new line characters are acceptable inside a quoted field, and thus one field can have multiple lines of text inside it
/// If <see cref="TextLoader.Options.AllowQuoting"/> is false, this option is ignored.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", ShortName = "multilines", Hide = true)]
Comment thread
antoniovs1029 marked this conversation as resolved.
public bool ReadMultilines = false;

/// <summary>
/// File containing a header with feature names. If specified, the header defined in the data file is ignored regardless of <see cref="HasHeader"/>.
/// </summary>
Expand Down Expand Up @@ -694,11 +701,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
ch.Assert(0 <= inputSize & inputSize < SrcLim);
List<ReadOnlyMemory<char>> lines = null;
if (headerFile != null)
Cursor.GetSomeLines(headerFile, 1, ref lines);
Cursor.GetSomeLines(headerFile, 1, false, ref lines);
if (needInputSize && inputSize == 0)
Cursor.GetSomeLines(dataSample, 100, ref lines);
Cursor.GetSomeLines(dataSample, 100, parent._readMultilines,ref lines);
else if (headerFile == null && parent.HasHeader)
Cursor.GetSomeLines(dataSample, 1, ref lines);
Cursor.GetSomeLines(dataSample, 1, false, ref lines);

if (needInputSize && inputSize == 0)
{
Expand Down Expand Up @@ -1081,6 +1088,7 @@ private enum OptionFlags : uint
private const int SrcLim = int.MaxValue;

private readonly bool _useThreads;
private readonly bool _readMultilines;
private readonly OptionFlags _flags;
private readonly long _maxRows;
// Input size is zero for unknown - determined by the data (including sparse rows).
Expand Down Expand Up @@ -1138,6 +1146,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
_host.Assert(Utils.Size(cols) > 0);

_useThreads = options.UseThreads;
_readMultilines = options.AllowQuoting ? options.ReadMultilines : false;

if (options.TrimWhitespace)
_flags |= OptionFlags.TrimWhitespace;
Expand Down Expand Up @@ -1350,6 +1359,8 @@ private TextLoader(IHost host, ModelLoadContext ctx)

// REVIEW: Should we serialize this? It really isn't part of the data model.
_useThreads = true;
// MYTODO: Should we serialize this? probably yes and also include it in Flags
_readMultilines = false;
Comment thread
antoniovs1029 marked this conversation as resolved.
Outdated

// *** Binary format ***
// int: sizeof(Float)
Expand Down
85 changes: 78 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
Expand Down Expand Up @@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
SetupCursor(parent, active, 0, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, 1);
var stats = new ParseStats(parent._host, 1);
return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
}
Expand All @@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
SetupCursor(parent, active, n, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, cthd);
var stats = new ParseStats(parent._host, cthd);
if (cthd <= 1)
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
Expand Down Expand Up @@ -204,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
};
}

public static void GetSomeLines(IMultiStreamSource source, int count, ref List<ReadOnlyMemory<char>> lines)
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, ref List<ReadOnlyMemory<char>> lines)
{
Contracts.AssertValue(source);
Contracts.Assert(count > 0);
Expand All @@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List<R
count = 2;

LineBatch batch;
var reader = new LineReader(source, count, 1, false, count, 1);
var reader = new LineReader(source, count, 1, false, readMultilines, count, 1);
try
{
batch = reader.GetBatch();
Expand Down Expand Up @@ -401,6 +402,7 @@ private sealed class LineReader
{
private readonly long _limit;
private readonly bool _hasHeader;
private readonly bool _readMultilines;
private readonly int _batchSize;
private readonly IMultiStreamSource _files;

Expand All @@ -410,7 +412,7 @@ private sealed class LineReader
private Task _thdRead;
private volatile bool _abort;

public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, long limit, int cref)
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, long limit, int cref)
{
// Note that files is allowed to be empty.
Contracts.AssertValue(files);
Expand All @@ -423,6 +425,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
_limit = limit;
_hasHeader = hasHeader;
_batchSize = batchSize;
_readMultilines = readMultilines;
_files = files;
_cref = cref;

Expand Down Expand Up @@ -464,9 +467,69 @@ public LineBatch GetBatch()
throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception");
}

private static class MultiLineReader
{
// When reading lines that contain quoted fields, the quoted fields can contain
// '\n' so we we'll need to read multiple lines (multilines) to get all the fields
// of a given row.
public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreHashLine)
{
string line;
line = sr.ReadLine();

// if it was an empty line or if we've reached the end of file (i.e. line = null)
if (string.IsNullOrEmpty(line))
return line;

// In ML.NET we filter out lines beginning with // and # at the beginning of the file
// Or lines beginning with // elsewhere in the file.
// Thus, we don't care to check if there's a quoted multiline when the line begins with
// these chars.
if (line[0] == '/' && line[1] == '/')
return line;
if (ignoreHashLine && line[0] == '#')
return line;

@justinormont justinormont May 15, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these check if you're at the top of the file? I think once you've hit the first row of data, no more ignoring of rows is allowed. #Resolved

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current behavior is that at the beginning of the file both // and # are ignored by LineReader link, but after the first row of data is hit, LineReader only ignores // link

Because of this, I actually added the ignoreHashLine flag on this method, and use it on the code you've pointed to.

One of the tests I've added on this PR actually shows that "// lines" are ignored correctly throughout the file.


In reply to: 425748062 [](ancestors = 425748062)


// Get more lines until the number of quote characters is even
// 2 consecutive quotes are considered scaped quotes
long numOfQuotes = GetNumberOfChars(line, '"');
if (numOfQuotes % 2 == 0)
Comment thread
antoniovs1029 marked this conversation as resolved.
Outdated
return line;

sb.Clear();
sb.Append(line);
while (numOfQuotes % 2 != 0)

@justinormont justinormont May 19, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would you handle the case

Label,Size,Age
true,5.4",5

I think we need to check that a quoted field begins with the quote character (vs only includes ones). If the field isn't quoted, it can include non-escaped quote chars. If it's quoted, all quotes must be escaped within the field. #Resolved

@antoniovs1029 antoniovs1029 May 19, 2020

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the RFC your row isn't valid, as " should always be either escaped, or be used to indicate a quoted field. So I assumed that datasets should comply with RFC when using ReadMultiline, to make the MultilineReader code less complicated (and faster).

Should we actually accept this even when ReadMultiline is true? #Resolved

@antoniovs1029 antoniovs1029 May 19, 2020

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mmm ok, after thinking about this a bit, I will anyways have to change the logic in this method for the escapeChar feature, so I might just as well allow some subset of cases similar to the one you pointed out (i.e. using quotes inside an unquoted field, and not counting them as invalid quotes). I guess it's worth it to avoid users hitting the EOF exception and/or loading much of the text into memory which are things that would occur with invalid CSVs such as this one if I don't take special care of this kind of case.


In reply to: 427003836 [](ancestors = 427003836)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've done this. Let us follow this thread here:
#5125 (comment)


In reply to: 427010189 [](ancestors = 427010189,427003836)

{
line = sr.ReadLine();

if (line == null) // If we've reached the end of the file
break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case?
Comment thread
antoniovs1029 marked this conversation as resolved.
Outdated

if (line.Length != 0)
sb.Append(" "); // MYTODO: should we use instead a "\n" in here to separate lines?

@antoniovs1029 antoniovs1029 May 14, 2020

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@justinormont recommended adding an actual new line here instead of a space. But I'm not sure if it would matter much. Also I wouldn't know when to append "\n" instead of "\r\n", or if it would make any difference. #Resolved

@justinormont justinormont May 15, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. We'll want newlines instead of spaces.

The same as newlines are different to human readers, the ML can just as well make use of this information. We also have no control over how the user featurizes their dataset in the ML pipeline; the newlines could be important to their processing.

We also would want to maintain round trip read/write of the dataset. Which means we'd have to modify the TextSaver to support (or verify current support) for newlines. #Resolved

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've done this in a more recent iteration. To continue discussing about this, it'd be better to do it here:
#5125 (comment)


In reply to: 425757137 [](ancestors = 425757137)


sb.Append(line);
numOfQuotes += GetNumberOfChars(line, '"');
}

return sb.ToString();
}

public static int GetNumberOfChars(string line, char ch)
{

@harishsk harishsk May 14, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is line.ToCharArray().Count(c => c == ch) a better approach? #Resolved

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it, but I read here that actually the most efficient way is to iterate over the chars:
https://stackoverflow.com/a/10391548


In reply to: 425452787 [](ancestors = 425452787)

@justinormont justinormont May 15, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If perf testing shows excessive time spent in this function, you could argument the existing C++/assembly intrinsics w/ a character counter. I think _mm256_cmpeq_epi8 would work to accelerate the comparison. #Resolved

int count = 0;
foreach (char c in line)
{
if (c == ch) count++;
}
return count;
}
}

private void ThreadProc()
{
Contracts.Assert(_batchSize >= 2);
var multilineSB = new StringBuilder();

try
{
Expand All @@ -487,7 +550,11 @@ private void ThreadProc()
// REVIEW: Avoid allocating a string for every line. This would probably require
// introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder)
// and implementing all the necessary conversion functionality on it. See task 3871.
text = rdr.ReadLine();
if (_readMultilines)
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true);
else
text = rdr.ReadLine();

if (text == null)
goto LNext;
line++;
Expand All @@ -514,7 +581,11 @@ private void ThreadProc()
if (_abort)
return;

text = rdr.ReadLine();
if (_readMultilines)
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true);
else
text = rdr.ReadLine();

if (text == null)
{
// We're done with this file. Queue the last partial batch.
Expand Down
5 changes: 5 additions & 0 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan<char> span)
scan.QuotingError = true;
break;
}

// The logic below allow us to escape quotes (") inside quoted
// fields by using doublo quotes (""). I.e. when the loader
// encounters "" inside a quoted field, it will output only one "
// and continue parsing the rest of the field.
if (span[ichCur] == '"')

@justinormont justinormont May 15, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might want to expose this character as a parameter. The other common choice is backslash escaping. #Resolved

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To implement the feature you're suggesting on this comment I do believe that we only need to change the code you've pointed to in here, so it's kind of straightforward.

Right now I am going to focus on making sure this PR works well with AutoML/Modelbuilder, that TextSaver works with new lines and that we handle correctly the case of having a badly formatted input file. If I can tackle this with enough time before the next release, then I will try to implement this suggestion.


In reply to: 425777440 [](ancestors = 425777440)

@antoniovs1029 antoniovs1029 May 15, 2020

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change in plans, I won't address surfacing the new readMultiline option to AutoML/ModelBuilder on this PR. So now I will actually focus on the other things, including implementing this escapechar option before the next release. See #5125 (comment)


In reply to: 425777440 [](ancestors = 425777440)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to myself: I've just realized that to implement the escapechar feature, then I'll also need to change the logic in the MultilineReader, because if the escapechar isn't " then checking for even number of quotes won't be enough.


In reply to: 425777440 [](ancestors = 425777440)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed offline, I'll address this in an upcoming PR.


In reply to: 426128704 [](ancestors = 426128704,425777440)

{
if (ichCur > ichRun)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
AllowSparse = allowSparse
AllowSparse = allowSparse,
};

return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample);
Expand Down
12 changes: 12 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,18 @@
"IsNullable": false,
"Default": true
},
{
"Name": "ReadMultilines",
"Type": "Bool",
"Desc": "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.",
"Aliases": [
"multilines"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": false
},
{
"Name": "HeaderFile",
"Type": "String",
Expand Down
33 changes: 33 additions & 0 deletions test/BaselineOutput/Common/TextLoader/multiline.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
10
11
12
13
14
15
16
17
0
19
20
this is a description
this is a quoted description
this is a multiline quoted description
this has one"doublequote which should be escaped as a single quote
this has "doublequotes" inside of it
this is a multiline quoted description with "doublequotes" and empty new lines and escaped quotes inside of "it" //and this comment with // shouldn't be ignored since it is part of the "multiline"
here is text after the empty line
this is a line with an empty animal
this is a line with an empty id

this is the last row description
dog
cat
bird
dog
cat
bird
dog

bird
dog
cat
79 changes: 78 additions & 1 deletion test/Microsoft.ML.Tests/TextLoaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ public class ModelWithColumnNameAttribute
}
}

public class TextLoaderFromModelTests : BaseTestClass
public class TextLoaderFromModelTests : BaseTestBaseline
{
public TextLoaderFromModelTests(ITestOutputHelper output)
: base(output)
Expand Down Expand Up @@ -920,5 +920,82 @@ public void TestLoadTextWithoutKeyTypeAttribute()

Assert.Equal(expectedCount, data.Schema[1].Type.GetKeyCount());
}

[Fact]
public void TestLoadTextWithEscapedNewLines()
{
var mlContext = new MLContext(seed: 1);
var dataPath = GetDataPath("multiline.csv");
var baselinePath = GetBaselinePath("TextLoader", "multiline.csv");
var options = new TextLoader.Options()
{
HasHeader = true,
Separator = ",",
AllowQuoting = true,
ReadMultilines = true,
Columns = new[]
{
new TextLoader.Column("id", DataKind.Int32, 0),
new TextLoader.Column("description", DataKind.String, 1),
new TextLoader.Column("animal", DataKind.String, 2),
},
};

var data = mlContext.Data.LoadFromTextFile(dataPath, options);

// Get values from loaded dataview
var ids = new List<string>();
var descriptions = new List<string>();
var animals = new List<string>();
using(var curs = data.GetRowCursorForAllColumns())
{
var idGetter = curs.GetGetter<int>(data.Schema["id"]);
var descriptionGetter = curs.GetGetter<ReadOnlyMemory<char>>(data.Schema["description"]);
var animalGetter = curs.GetGetter<ReadOnlyMemory<char>>(data.Schema["animal"]);

int id = default;
ReadOnlyMemory<char> description = default;
ReadOnlyMemory<char> animal = default;

while(curs.MoveNext())
{
idGetter(ref id);
descriptionGetter(ref description);
animalGetter(ref animal);

ids.Add(id.ToString());
descriptions.Add(description.ToString());
animals.Add(animal.ToString());
}
}

const int numRows = 11;
Assert.Equal(numRows, ids.Count());
Assert.Equal(numRows, descriptions.Count());
Assert.Equal(numRows, animals.Count());

// Compare values with baseline file
string line;
using (var file = new StreamReader(baselinePath))
{
for(int i = 0; i < numRows; i++)
{
line = file.ReadLine();
Assert.Equal(ids[i], line);
}

for (int i = 0; i < numRows; i++)
{
line = file.ReadLine();
Assert.Equal(descriptions[i], line);
}

for (int i = 0; i < numRows; i++)
{
line = file.ReadLine();
Assert.Equal(animals[i], line);
}
}
}
}
}
Loading