From e03b0e4a73094c576f8d5ae8b72065a4d014290a Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Tue, 17 Dec 2019 15:52:22 -0800 Subject: [PATCH 01/28] fix textloader bug on quotes --- .../DataLoadSave/Text/TextLoaderCursor.cs | 4 +-- .../Utilities/StreamUtils.cs | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 4394b26121..76e310bef6 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -487,7 +487,7 @@ private void ThreadProc() // REVIEW: Avoid allocating a string for every line. This would probably require // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) // and implementing all the necessary conversion functionality on it. See task 3871. - text = rdr.ReadLine(); + text = rdr.ReadEntry(); if (text == null) goto LNext; line++; @@ -514,7 +514,7 @@ private void ThreadProc() if (_abort) return; - text = rdr.ReadLine(); + text = rdr.ReadEntry(); if (text == null) { // We're done with this file. Queue the last partial batch. diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index f157d09ea8..eba6559938 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -174,5 +174,40 @@ private static string[] Expand(string pattern) return matchList.ToArray(); } #endif + + public static string ReadEntry(this TextReader sr) + { + string strReturn = string.Empty; + + // get first bit + strReturn += sr.ReadLine(); + + // And get more lines until the number of quotes is even + while (strReturn.GetNumberOf("\"").IsOdd()) + { + string strNow = sr.ReadLine(); + strReturn += strNow; + } + + // Then return what we've gotten + if (strReturn == string.Empty) + { + return null; + } + else + { + return strReturn; + } + } + + public static int GetNumberOf(this string s, string strSearchString) + { + return s.Length - s.Replace(strSearchString, string.Empty).Length; + } + + public static bool IsOdd(this int i) + { + return i % 2 != 0; + } } } From 3ddb3b08f8e26accadebb3ef59e2e8eebe6959fc Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Thu, 19 Dec 2019 10:30:17 -0800 Subject: [PATCH 02/28] use static method instead of extension method --- src/Microsoft.ML.Data/Utilities/StreamUtils.cs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index eba6559938..bbc74f6844 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -183,7 +183,7 @@ public static string ReadEntry(this TextReader sr) strReturn += sr.ReadLine(); // And get more lines until the number of quotes is even - while (strReturn.GetNumberOf("\"").IsOdd()) + while (GetNumberOf(strReturn, "\"") % 2 != 0 ) { string strNow = sr.ReadLine(); strReturn += strNow; @@ -200,14 +200,13 @@ public static string ReadEntry(this TextReader sr) } } - public static int GetNumberOf(this string s, string strSearchString) + public static int GetNumberOf(string s, string strSearchString) { - return s.Length - s.Replace(strSearchString, string.Empty).Length; - } - - public static bool IsOdd(this int i) - { - return i % 2 != 0; + if(strSearchString.Length == 0 || s.Length == 0) + { + return 0; + } + return (s.Length - s.Replace(strSearchString, string.Empty).Length) / strSearchString.Length; } } } From 9b20ead4edbcd4e0a2950c0dcde6ae00f2b6ccf3 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Thu, 19 Dec 2019 10:32:10 -0800 Subject: [PATCH 03/28] better name --- src/Microsoft.ML.Data/Utilities/StreamUtils.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index bbc74f6844..32ee6c774f 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -177,26 +177,26 @@ private static string[] Expand(string pattern) public static string ReadEntry(this TextReader sr) { - string strReturn = string.Empty; + string entry = string.Empty; // get first bit - strReturn += sr.ReadLine(); + entry += sr.ReadLine(); // And get more lines until the number of quotes is even - while (GetNumberOf(strReturn, "\"") % 2 != 0 ) + while (GetNumberOf(entry, "\"") % 2 != 0 ) { - string strNow = sr.ReadLine(); - strReturn += strNow; + string line = sr.ReadLine(); + entry += line; } // Then return what we've gotten - if (strReturn == string.Empty) + if (entry == string.Empty) { return null; } else { - return strReturn; + return entry; } } From 8304d6258bd2bbfe3de419562edd3c8830fa9756 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Wed, 13 May 2020 01:02:00 -0700 Subject: [PATCH 04/28] Moved multiline reader to TextLoaderCursor, and added return when multiline doesn't close quoting --- .../DataLoadSave/Text/TextLoaderCursor.cs | 45 ++++++++++++++++++- .../Utilities/StreamUtils.cs | 34 -------------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index ab72ec3d57..e2c34f097b 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.IO; using System.Text; using System.Threading; using System.Threading.Tasks; @@ -464,6 +465,46 @@ public LineBatch GetBatch() throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception"); } + private static class MultiLineReader + { + public static string ReadMultiLine(TextReader sr) + { + string entry = string.Empty; + + // get first bit + entry += sr.ReadLine(); + + // And get more lines until the number of quotes is even + while (GetNumberOf(entry, "\"") % 2 != 0) + { + string line = sr.ReadLine(); + entry += line; + if (line == null) + return entry; + } + + // Then return what we've gotten + if (entry == string.Empty) + { + return null; + } + else + { + return entry; + } + } + + public static int GetNumberOf(string s, string strSearchString) + { + if (strSearchString.Length == 0 || s.Length == 0) + { + return 0; + } + return (s.Length - s.Replace(strSearchString, string.Empty).Length) / strSearchString.Length; + } + + } + private void ThreadProc() { Contracts.Assert(_batchSize >= 2); @@ -487,7 +528,7 @@ private void ThreadProc() // REVIEW: Avoid allocating a string for every line. This would probably require // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) // and implementing all the necessary conversion functionality on it. See task 3871. - text = rdr.ReadEntry(); + text = MultiLineReader.ReadMultiLine(rdr); if (text == null) goto LNext; line++; @@ -514,7 +555,7 @@ private void ThreadProc() if (_abort) return; - text = rdr.ReadEntry(); + text = MultiLineReader.ReadMultiLine(rdr); if (text == null) { // We're done with this file. Queue the last partial batch. diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index 32ee6c774f..f157d09ea8 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -174,39 +174,5 @@ private static string[] Expand(string pattern) return matchList.ToArray(); } #endif - - public static string ReadEntry(this TextReader sr) - { - string entry = string.Empty; - - // get first bit - entry += sr.ReadLine(); - - // And get more lines until the number of quotes is even - while (GetNumberOf(entry, "\"") % 2 != 0 ) - { - string line = sr.ReadLine(); - entry += line; - } - - // Then return what we've gotten - if (entry == string.Empty) - { - return null; - } - else - { - return entry; - } - } - - public static int GetNumberOf(string s, string strSearchString) - { - if(strSearchString.Length == 0 || s.Length == 0) - { - return 0; - } - return (s.Length - s.Replace(strSearchString, string.Empty).Length) / strSearchString.Length; - } } } From a9e91e205d15d134924f999cb28772e9d502acb5 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Wed, 13 May 2020 01:44:24 -0700 Subject: [PATCH 05/28] Fix issue with empty unquoted new lines --- .../DataLoadSave/Text/TextLoaderCursor.cs | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index e2c34f097b..3e1eb50979 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -469,29 +469,24 @@ private static class MultiLineReader { public static string ReadMultiLine(TextReader sr) { - string entry = string.Empty; + string line; - // get first bit - entry += sr.ReadLine(); + line = sr.ReadLine(); + if (String.IsNullOrEmpty(line)) // if it was an empty line or we've reached the end of file + return line; + + string entry = line; // And get more lines until the number of quotes is even while (GetNumberOf(entry, "\"") % 2 != 0) { - string line = sr.ReadLine(); + line = sr.ReadLine(); entry += line; - if (line == null) - return entry; + if (line == null) // If we've reached the end of the file + return entry; // MYTODO: This will happen if we have an invalid open quote which never closes, should we throw instead? } - // Then return what we've gotten - if (entry == string.Empty) - { - return null; - } - else - { - return entry; - } + return entry; } public static int GetNumberOf(string s, string strSearchString) From 9cfaee1fa0f146a56c9d526e70e46eab819c3c85 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Wed, 13 May 2020 03:57:36 -0700 Subject: [PATCH 06/28] Make ReadMultiLine a little bit more efficient --- .../DataLoadSave/Text/TextLoaderCursor.cs | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 3e1eb50979..ae56cb41e7 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -472,32 +472,37 @@ public static string ReadMultiLine(TextReader sr) string line; line = sr.ReadLine(); - if (String.IsNullOrEmpty(line)) // if it was an empty line or we've reached the end of file + // if it was an empty line or if we've reached the end of file (i.e. line = null) + if (string.IsNullOrEmpty(line)) return line; - string entry = line; + var sb = new StringBuilder(line); - // And get more lines until the number of quotes is even - while (GetNumberOf(entry, "\"") % 2 != 0) + // Get more lines until the number of quote characters is even + long numOfQuotes = GetNumberOfChars(line, '\"'); + while (numOfQuotes % 2 != 0) { line = sr.ReadLine(); - entry += line; + if (line == null) // If we've reached the end of the file - return entry; // MYTODO: This will happen if we have an invalid open quote which never closes, should we throw instead? + break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case? + + sb.Append(line); // MYTODO: should we also add a "\n" in here? + numOfQuotes += GetNumberOfChars(line, '\"'); } - return entry; + return sb.ToString(); } - public static int GetNumberOf(string s, string strSearchString) + public static int GetNumberOfChars(string line, char ch) { - if (strSearchString.Length == 0 || s.Length == 0) + int count = 0; + foreach (char c in line) { - return 0; + if (c == ch) count++; } - return (s.Length - s.Replace(strSearchString, string.Empty).Length) / strSearchString.Length; + return count; } - } private void ThreadProc() From 2827c612d2bd176a7111e1b7742edc728e55dcdd Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Wed, 13 May 2020 09:50:16 -0700 Subject: [PATCH 07/28] Add clarifying comment in FetchNextField --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index dd45e8c240..c6953b709d 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan span) scan.QuotingError = true; break; } + + // The logic below allow us to scape quotes (") inside quoted + // fields by using doublo quotes (""). I.e. when the loader + // encounters "" inside a quoted field, it will output only one " + // and continue parsing the rest of the field. if (span[ichCur] == '"') { if (ichCur > ichRun) From a4276620da33473429d71d71a2f0ec5fbb1985c8 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Wed, 13 May 2020 16:31:35 -0700 Subject: [PATCH 08/28] Added test --- .../Common/TextLoader/multiline.csv | 33 +++++++++ test/Microsoft.ML.Tests/TextLoaderTests.cs | 71 ++++++++++++++++++- test/data/multiline.csv | 32 +++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 test/BaselineOutput/Common/TextLoader/multiline.csv create mode 100644 test/data/multiline.csv diff --git a/test/BaselineOutput/Common/TextLoader/multiline.csv b/test/BaselineOutput/Common/TextLoader/multiline.csv new file mode 100644 index 0000000000..fd4cb55e41 --- /dev/null +++ b/test/BaselineOutput/Common/TextLoader/multiline.csv @@ -0,0 +1,33 @@ +10 +11 +12 +13 +14 +15 +16 +17 +0 +19 +20 +this is a description +this is a quoted description +this is a multiline quoted description +this has one"doublequote which should be escaped as a single quote +this has "doublequotes" inside of it +this is a multiline quoted description with "doublequotes" and empty new lines and escaped quotes inside of "it" //and this comment with // shouldn't be ignored since it is part of the "multiline" +here is text after the empty line +this is a line with an empty animal +this is a line with an empty id + +this is the last row description +dog +cat +bird +dog +cat +bird +dog + +bird +dog +cat \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 4f6c55dad7..bfb82caacc 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -660,7 +660,7 @@ public class ModelWithColumnNameAttribute } } - public class TextLoaderFromModelTests : BaseTestClass + public class TextLoaderFromModelTests : BaseTestBaseline { public TextLoaderFromModelTests(ITestOutputHelper output) : base(output) @@ -920,5 +920,74 @@ public void TestLoadTextWithoutKeyTypeAttribute() Assert.Equal(expectedCount, data.Schema[1].Type.GetKeyCount()); } + + [Fact] + public void TestLoadTextWithEscapedNewLines() + { + var mlContext = new MLContext(seed: 1); + var dataPath = GetDataPath("multiline.csv"); + var baselinePath = GetBaselinePath("TextLoader", "multiline.csv"); + var data = mlContext.Data.LoadFromTextFile(dataPath, new[] + { + new TextLoader.Column("id", DataKind.Int32, 0), + new TextLoader.Column("description", DataKind.String, 1), + new TextLoader.Column("animal", DataKind.String, 2), + }, + hasHeader: true, separatorChar:',', allowQuoting:true); + + // Get values from loaded dataview + var ids = new List(); + var descriptions = new List(); + var animals = new List(); + using(var curs = data.GetRowCursorForAllColumns()) + { + var idGetter = curs.GetGetter(data.Schema["id"]); + var descriptionGetter = curs.GetGetter>(data.Schema["description"]); + var animalGetter = curs.GetGetter>(data.Schema["animal"]); + + int id = default; + ReadOnlyMemory description = default; + ReadOnlyMemory animal = default; + + while(curs.MoveNext()) + { + idGetter(ref id); + descriptionGetter(ref description); + animalGetter(ref animal); + + ids.Add(id.ToString()); + descriptions.Add(description.ToString()); + animals.Add(animal.ToString()); + } + } + + const int numRows = 11; + Assert.Equal(numRows, ids.Count()); + Assert.Equal(numRows, descriptions.Count()); + Assert.Equal(numRows, animals.Count()); + + // Compare values with baseline file + string line; + using (var file = new StreamReader(baselinePath)) + { + for(int i = 0; i < numRows; i++) + { + line = file.ReadLine(); + Assert.Equal(ids[i], line); + } + + for (int i = 0; i < numRows; i++) + { + line = file.ReadLine(); + Assert.Equal(descriptions[i], line); + } + + for (int i = 0; i < numRows; i++) + { + line = file.ReadLine(); + Assert.Equal(animals[i], line); + } + } + } } } diff --git a/test/data/multiline.csv b/test/data/multiline.csv new file mode 100644 index 0000000000..5d0397f71f --- /dev/null +++ b/test/data/multiline.csv @@ -0,0 +1,32 @@ +// this file should be loaded with quoting enabled +// and it should load without problems by the TextLoader +id,description,animal +// this is a comment that will be ignored +// this is a comment with "quotes" that will also be ignored +// this is a comment with a "quote without close quote that will also be ignored +10,this is a description,dog +11,"this is a quoted description",cat +12,"this is a multiline +quoted description", bird +13,"this has one""doublequote which should be escaped as a single quote",dog +14,"this has ""doublequotes"" inside of it",cat +15, "this is a multiline +quoted description with + +""doublequotes"" and + +empty new lines and + + + +escaped quotes inside of ""it"" + +//and this comment with // shouldn't be ignored +since it is part of the ""multiline""",bird +// this line should be ignored, and the next line is empty: + +16, here is text after the empty line, dog +17, this is a line with an empty animal,"" +"", this is a line with an empty id, bird +19,"",dog +20,this is the last row description,cat \ No newline at end of file From 1cafbf05a5c44d36c6f48e51bc576dc7636e79af Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Wed, 13 May 2020 17:14:49 -0700 Subject: [PATCH 09/28] Added test for column inference --- .../ColumnInferenceTests.cs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs index eba7b9b7b3..1c6deb0b31 100644 --- a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs @@ -186,5 +186,26 @@ public void InferColumnsColumnInfoParam() Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First()); Assert.Null(result.ColumnInformation.ExampleWeightColumnName); } + + [Fact] + public void InferColumnsFromMultilineInputFile() + { + // Check if we can infer the column information + // from and input file which has escaped newlines inside quotes + var dataPath = GetDataPath("multiline.csv"); + MLContext mlContext = new MLContext(); + var inputColumnInformation = new ColumnInformation(); + inputColumnInformation.LabelColumnName = @"id"; + var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation); + + // File only have 3 columns: "id", "description" and "animal" + Assert.NotNull(result.ColumnInformation.LabelColumnName); + Assert.Equal(1, result.ColumnInformation.TextColumnNames.Count); + Assert.Equal(1, result.ColumnInformation.CategoricalColumnNames.Count); + + Assert.Equal("id", result.ColumnInformation.LabelColumnName); + Assert.Equal("description", result.ColumnInformation.TextColumnNames.First()); + Assert.Equal("animal", result.ColumnInformation.CategoricalColumnNames.First()); + } } } \ No newline at end of file From 6116d97dcad03f5bb5e0d49568e868bd1b76cd5a Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 14 May 2020 00:27:52 -0700 Subject: [PATCH 10/28] Make multilinereader a little bit more efficient --- .../DataLoadSave/Text/TextLoaderCursor.cs | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index ae56cb41e7..6be8341b1d 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -467,19 +467,35 @@ public LineBatch GetBatch() private static class MultiLineReader { - public static string ReadMultiLine(TextReader sr) + // When reading lines that contain quoted fields, the quoted fields can contain + // '\n' so we we'll need to read multiple lines (multilines) to get all the fields + // of a given row. + public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreHashLine) { string line; - line = sr.ReadLine(); + // if it was an empty line or if we've reached the end of file (i.e. line = null) if (string.IsNullOrEmpty(line)) return line; - var sb = new StringBuilder(line); + // In ML.NET we filter out lines beginning with // and # at the beginning of the file + // Or lines beginning with // elsewhere in the file. + // Thus, we don't care to check if there's a quoted multiline when the line begins with + // these chars. + if (line[0] == '/' && line[1] == '/') + return line; + if (ignoreHashLine && line[0] == '#') + return line; // Get more lines until the number of quote characters is even - long numOfQuotes = GetNumberOfChars(line, '\"'); + // 2 consecutive quotes are considered scaped quotes + long numOfQuotes = GetNumberOfChars(line, '"'); + if (numOfQuotes % 2 == 0) + return line; + + sb.Clear(); + sb.Append(line); while (numOfQuotes % 2 != 0) { line = sr.ReadLine(); @@ -487,8 +503,11 @@ public static string ReadMultiLine(TextReader sr) if (line == null) // If we've reached the end of the file break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case? - sb.Append(line); // MYTODO: should we also add a "\n" in here? - numOfQuotes += GetNumberOfChars(line, '\"'); + if(line.Length != 0) + sb.Append(" "); // MYTODO: should we use instead a "\n" in here to separate lines? + + sb.Append(line); + numOfQuotes += GetNumberOfChars(line, '"'); } return sb.ToString(); @@ -508,6 +527,7 @@ public static int GetNumberOfChars(string line, char ch) private void ThreadProc() { Contracts.Assert(_batchSize >= 2); + var multilineSB = new StringBuilder(); try { @@ -528,7 +548,7 @@ private void ThreadProc() // REVIEW: Avoid allocating a string for every line. This would probably require // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) // and implementing all the necessary conversion functionality on it. See task 3871. - text = MultiLineReader.ReadMultiLine(rdr); + text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true); if (text == null) goto LNext; line++; @@ -555,7 +575,7 @@ private void ThreadProc() if (_abort) return; - text = MultiLineReader.ReadMultiLine(rdr); + text = MultiLineReader.ReadMultiLine(rdr, multilineSB, false); if (text == null) { // We're done with this file. Queue the last partial batch. From df2ca25f31d7678212beb41f64a99f3db11a992c Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 14 May 2020 02:26:20 -0700 Subject: [PATCH 11/28] Create read multilines parameter --- .../DataLoadSave/Text/TextLoader.cs | 22 +++++++++++--- .../DataLoadSave/Text/TextLoaderCursor.cs | 24 ++++++++++----- .../Text/TextLoaderSaverCatalog.cs | 30 ++++++++++++++----- 3 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 2130424513..a33a4ea6c3 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -501,6 +501,13 @@ public class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Use separate parsing threads?", ShortName = "threads", Hide = true)] public bool UseThreads = true; + /// + /// If true, new line characters are acceptable inside a quoted field, and thus one field can have multiple lines of text inside it + /// If is false, this option is ignored. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", ShortName = "multilines", Hide = true)] + public bool ReadMultilines = false; + /// /// File containing a header with feature names. If specified, the header defined in the data file is ignored regardless of . /// @@ -530,6 +537,7 @@ internal static class Defaults internal const char Separator = '\t'; internal const bool HasHeader = false; internal const bool TrimWhitespace = false; + internal const bool ReadMultilines = false; } /// @@ -694,11 +702,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, ref lines); + Cursor.GetSomeLines(headerFile, 1, false, ref lines); if (needInputSize && inputSize == 0) - Cursor.GetSomeLines(dataSample, 100, ref lines); + Cursor.GetSomeLines(dataSample, 100, parent._readMultilines,ref lines); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, ref lines); + Cursor.GetSomeLines(dataSample, 1, false, ref lines); if (needInputSize && inputSize == 0) { @@ -1081,6 +1089,7 @@ private enum OptionFlags : uint private const int SrcLim = int.MaxValue; private readonly bool _useThreads; + private readonly bool _readMultilines; private readonly OptionFlags _flags; private readonly long _maxRows; // Input size is zero for unknown - determined by the data (including sparse rows). @@ -1138,6 +1147,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo _host.Assert(Utils.Size(cols) > 0); _useThreads = options.UseThreads; + _readMultilines = options.AllowQuoting ? options.ReadMultilines : false; if (options.TrimWhitespace) _flags |= OptionFlags.TrimWhitespace; @@ -1350,6 +1360,8 @@ private TextLoader(IHost host, ModelLoadContext ctx) // REVIEW: Should we serialize this? It really isn't part of the data model. _useThreads = true; + // MYTODO: Should we serialize this? probably yes and also include it in Flags + _readMultilines = false; // *** Binary format *** // int: sizeof(Float) @@ -1458,6 +1470,7 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host, bool allowQuoting = Defaults.AllowQuoting, bool supportSparse = Defaults.AllowSparse, bool trimWhitespace = Defaults.TrimWhitespace, + bool readMultilines = Defaults.ReadMultilines, IMultiStreamSource dataSample = null) { var userType = typeof(TInput); @@ -1527,7 +1540,8 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host, AllowQuoting = allowQuoting, AllowSparse = supportSparse, TrimWhitespace = trimWhitespace, - Columns = columns.ToArray() + Columns = columns.ToArray(), + ReadMultilines = readMultilines }; return new TextLoader(host, options, dataSample: dataSample); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 6be8341b1d..335db25905 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil SetupCursor(parent, active, 0, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, 1); var stats = new ParseStats(parent._host, 1); return new Cursor(parent, stats, active, reader, srcNeeded, cthd); } @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc SetupCursor(parent, active, n, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, cthd); var stats = new ParseStats(parent._host, cthd); if (cthd <= 1) return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; @@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter() }; } - public static void GetSomeLines(IMultiStreamSource source, int count, ref List> lines) + public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, ref List> lines) { Contracts.AssertValue(source); Contracts.Assert(count > 0); @@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List + /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain + /// multiple lines of text. If allowQuoting is false, this parameter is ignored. /// /// /// + /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain + /// multiple lines of text. If allowQuoting is false, this parameter is ignored. public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - bool allowSparse = TextLoader.Defaults.AllowSparse) + bool allowSparse = TextLoader.Defaults.AllowSparse, + bool readMultilines = TextLoader.Defaults.ReadMultilines) => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, - allowSparse, trimWhitespace, dataSample: dataSample); + allowSparse, trimWhitespace, readMultilines, dataSample: dataSample); /// /// Load a from a text file using . @@ -142,6 +149,8 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat /// A column may also have dense values followed by sparse values represented in this fashion. For example, /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column. + /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain + /// multiple lines of text. If allowQuoting is false, this parameter is ignored. /// The data view. public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, @@ -150,7 +159,8 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - bool allowSparse = TextLoader.Defaults.AllowSparse) + bool allowSparse = TextLoader.Defaults.AllowSparse, + bool readMultilines = TextLoader.Defaults.ReadMultilines) { Contracts.CheckNonEmpty(path, nameof(path)); if (!File.Exists(path)) @@ -165,7 +175,8 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, HasHeader = hasHeader, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace, - AllowSparse = allowSparse + AllowSparse = allowSparse, + ReadMultilines = readMultilines }; var loader = new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options); @@ -194,6 +205,8 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, /// A column may also have dense values followed by sparse values represented in this fashion. For example, /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column. + /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain + /// multiple lines of text. If allowQuoting is false, this parameter is ignored. /// The data view. public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, @@ -201,7 +214,8 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - bool allowSparse = TextLoader.Defaults.AllowSparse) + bool allowSparse = TextLoader.Defaults.AllowSparse, + bool readMultilines = TextLoader.Defaults.ReadMultilines) { Contracts.CheckNonEmpty(path, nameof(path)); if (!File.Exists(path)) @@ -212,7 +226,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. // Therefore, we are going to disallow data sample. return TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, - allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path)); + allowQuoting, allowSparse, trimWhitespace, readMultilines).Load(new MultiFileSource(path)); } /// From 530f41e535b63c82c167e4a432f803f951033608 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 14 May 2020 02:27:02 -0700 Subject: [PATCH 12/28] Make tests run with readMultilines parameter --- .../ColumnInference/ColumnInferenceApi.cs | 6 ++++-- src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs | 3 ++- test/Microsoft.ML.Tests/TextLoaderTests.cs | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs index ae03511883..91d58ff720 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs @@ -56,7 +56,8 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path AllowSparse = splitInference.AllowSparse, AllowQuoting = splitInference.AllowQuote, HasHeader = hasHeader, - TrimWhitespace = trimWhitespace + TrimWhitespace = trimWhitespace, + ReadMultilines = true // MYTODO: is it ok to hardcode this? it is necessary for my test to pass }; var textLoader = context.Data.CreateTextLoader(typedLoaderOptions); var dataView = textLoader.Load(path); @@ -92,7 +93,8 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path AllowSparse = splitInference.AllowSparse, Separators = new char[] { splitInference.Separator.Value }, HasHeader = hasHeader, - TrimWhitespace = trimWhitespace + TrimWhitespace = trimWhitespace, + ReadMultilines = true // is it necessary to put this in here? }; return new ColumnInferenceResults() diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs index fe0066ab6e..db1c166e7a 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs @@ -66,7 +66,8 @@ from _sep in separatorCandidates } }, Separators = new[] { perm._sep }, AllowQuoting = perm._allowQuote, - AllowSparse = perm._allowSparse + AllowSparse = perm._allowSparse, + ReadMultilines = true //MYTODO: is it ok to hard code this in here? it's necessary for the test to pass }; if (TryParseFile(context, options, source, out result)) diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index bfb82caacc..a1eb9920ef 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -933,7 +933,7 @@ public void TestLoadTextWithEscapedNewLines() new TextLoader.Column("description", DataKind.String, 1), new TextLoader.Column("animal", DataKind.String, 2), }, - hasHeader: true, separatorChar:',', allowQuoting:true); + hasHeader: true, separatorChar:',', allowQuoting:true, readMultilines: true); // Get values from loaded dataview var ids = new List(); From d9af9d215462223d51fcd053575ab8d9f2204277 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Fri, 15 May 2020 01:25:34 -0700 Subject: [PATCH 13/28] Remove public parameter for readMultilines, as it is considered a breaking API change. --- .../DataLoadSave/Text/TextLoader.cs | 5 +--- .../Text/TextLoaderSaverCatalog.cs | 28 +++++-------------- test/Microsoft.ML.Tests/TextLoaderTests.cs | 12 ++++++-- 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index a33a4ea6c3..c339a2b1c5 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -537,7 +537,6 @@ internal static class Defaults internal const char Separator = '\t'; internal const bool HasHeader = false; internal const bool TrimWhitespace = false; - internal const bool ReadMultilines = false; } /// @@ -1470,7 +1469,6 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host, bool allowQuoting = Defaults.AllowQuoting, bool supportSparse = Defaults.AllowSparse, bool trimWhitespace = Defaults.TrimWhitespace, - bool readMultilines = Defaults.ReadMultilines, IMultiStreamSource dataSample = null) { var userType = typeof(TInput); @@ -1540,8 +1538,7 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host, AllowQuoting = allowQuoting, AllowSparse = supportSparse, TrimWhitespace = trimWhitespace, - Columns = columns.ToArray(), - ReadMultilines = readMultilines + Columns = columns.ToArray() }; return new TextLoader(host, options, dataSample: dataSample); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index ef3e891fa8..7c2d86925c 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -41,8 +41,6 @@ public static class TextLoaderSaverCatalog /// A column may also have dense values followed by sparse values represented in this fashion. For example, /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column. - /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain - /// multiple lines of text. If allowQuoting is false, this parameter is ignored. /// /// /// - /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain - /// multiple lines of text. If allowQuoting is false, this parameter is ignored. public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - bool allowSparse = TextLoader.Defaults.AllowSparse, - bool readMultilines = TextLoader.Defaults.ReadMultilines) + bool allowSparse = TextLoader.Defaults.AllowSparse) => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, - allowSparse, trimWhitespace, readMultilines, dataSample: dataSample); + allowSparse, trimWhitespace, dataSample: dataSample); /// /// Load a from a text file using . @@ -149,8 +142,6 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat /// A column may also have dense values followed by sparse values represented in this fashion. For example, /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column. - /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain - /// multiple lines of text. If allowQuoting is false, this parameter is ignored. /// The data view. public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, @@ -159,8 +150,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - bool allowSparse = TextLoader.Defaults.AllowSparse, - bool readMultilines = TextLoader.Defaults.ReadMultilines) + bool allowSparse = TextLoader.Defaults.AllowSparse) { Contracts.CheckNonEmpty(path, nameof(path)); if (!File.Exists(path)) @@ -175,8 +165,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, HasHeader = hasHeader, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace, - AllowSparse = allowSparse, - ReadMultilines = readMultilines + AllowSparse = allowSparse }; var loader = new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options); @@ -205,8 +194,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, /// A column may also have dense values followed by sparse values represented in this fashion. For example, /// a row containing "1 2 5 2:6 4:3" represents two dense columns with values 1 and 2, followed by 5 sparsely represented /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column. - /// If true, it will accept new line characters inside quoted fields. Thus, enabling that a single field contain - /// multiple lines of text. If allowQuoting is false, this parameter is ignored. /// The data view. public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, @@ -214,8 +201,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - bool allowSparse = TextLoader.Defaults.AllowSparse, - bool readMultilines = TextLoader.Defaults.ReadMultilines) + bool allowSparse = TextLoader.Defaults.AllowSparse) { Contracts.CheckNonEmpty(path, nameof(path)); if (!File.Exists(path)) @@ -226,7 +212,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. // Therefore, we are going to disallow data sample. return TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, - allowQuoting, allowSparse, trimWhitespace, readMultilines).Load(new MultiFileSource(path)); + allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path)); } /// diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index a1eb9920ef..cee06ef868 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -927,13 +927,21 @@ public void TestLoadTextWithEscapedNewLines() var mlContext = new MLContext(seed: 1); var dataPath = GetDataPath("multiline.csv"); var baselinePath = GetBaselinePath("TextLoader", "multiline.csv"); - var data = mlContext.Data.LoadFromTextFile(dataPath, new[] + var options = new TextLoader.Options() + { + HasHeader = true, + Separator = ",", + AllowQuoting = true, + ReadMultilines = true, + Columns = new[] { new TextLoader.Column("id", DataKind.Int32, 0), new TextLoader.Column("description", DataKind.String, 1), new TextLoader.Column("animal", DataKind.String, 2), }, - hasHeader: true, separatorChar:',', allowQuoting:true, readMultilines: true); + }; + + var data = mlContext.Data.LoadFromTextFile(dataPath, options); // Get values from loaded dataview var ids = new List(); From 6a7b6328a0bf2de1f2418bbef3bd353a58b794e5 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Fri, 15 May 2020 01:28:50 -0700 Subject: [PATCH 14/28] Update manifest --- .../Common/EntryPoints/core_manifest.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index d139710b10..67033afde3 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -405,6 +405,18 @@ "IsNullable": false, "Default": true }, + { + "Name": "ReadMultilines", + "Type": "Bool", + "Desc": "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", + "Aliases": [ + "multilines" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, { "Name": "HeaderFile", "Type": "String", From fb6ab28cb346325dcc1ee0d289cf3b9fd1714fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Vel=C3=A1zquez?= <38739674+antoniovs1029@users.noreply.github.com> Date: Fri, 15 May 2020 11:49:01 -0700 Subject: [PATCH 15/28] Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs Co-authored-by: Justin Ormont --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 335db25905..944febe1a5 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -505,7 +505,7 @@ public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreH if (line == null) // If we've reached the end of the file break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case? - if(line.Length != 0) + if (line.Length != 0) sb.Append(" "); // MYTODO: should we use instead a "\n" in here to separate lines? sb.Append(line); From f592a7fa2a1537cad20bc5e72f43164451f9923b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Vel=C3=A1zquez?= <38739674+antoniovs1029@users.noreply.github.com> Date: Fri, 15 May 2020 11:49:14 -0700 Subject: [PATCH 16/28] Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs Co-authored-by: Justin Ormont --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index c6953b709d..13019c4bf2 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -1162,7 +1162,7 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan span) break; } - // The logic below allow us to scape quotes (") inside quoted + // The logic below allow us to escape quotes (") inside quoted // fields by using doublo quotes (""). I.e. when the loader // encounters "" inside a quoted field, it will output only one " // and continue parsing the rest of the field. From a13b8030de3c1b5f24042113e32258a6736db61a Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Fri, 15 May 2020 11:59:41 -0700 Subject: [PATCH 17/28] Removed new readMultiline option from AutoML.NET and removed the test that checked ColumnInference --- .../ColumnInference/ColumnInferenceApi.cs | 6 ++---- .../ColumnInference/TextFileContents.cs | 3 +-- .../ColumnInferenceTests.cs | 21 ------------------- 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs index 91d58ff720..ae03511883 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs @@ -56,8 +56,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path AllowSparse = splitInference.AllowSparse, AllowQuoting = splitInference.AllowQuote, HasHeader = hasHeader, - TrimWhitespace = trimWhitespace, - ReadMultilines = true // MYTODO: is it ok to hardcode this? it is necessary for my test to pass + TrimWhitespace = trimWhitespace }; var textLoader = context.Data.CreateTextLoader(typedLoaderOptions); var dataView = textLoader.Load(path); @@ -93,8 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path AllowSparse = splitInference.AllowSparse, Separators = new char[] { splitInference.Separator.Value }, HasHeader = hasHeader, - TrimWhitespace = trimWhitespace, - ReadMultilines = true // is it necessary to put this in here? + TrimWhitespace = trimWhitespace }; return new ColumnInferenceResults() diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs index db1c166e7a..fe0066ab6e 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs @@ -66,8 +66,7 @@ from _sep in separatorCandidates } }, Separators = new[] { perm._sep }, AllowQuoting = perm._allowQuote, - AllowSparse = perm._allowSparse, - ReadMultilines = true //MYTODO: is it ok to hard code this in here? it's necessary for the test to pass + AllowSparse = perm._allowSparse }; if (TryParseFile(context, options, source, out result)) diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs index 1c6deb0b31..eba7b9b7b3 100644 --- a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs @@ -186,26 +186,5 @@ public void InferColumnsColumnInfoParam() Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First()); Assert.Null(result.ColumnInformation.ExampleWeightColumnName); } - - [Fact] - public void InferColumnsFromMultilineInputFile() - { - // Check if we can infer the column information - // from and input file which has escaped newlines inside quotes - var dataPath = GetDataPath("multiline.csv"); - MLContext mlContext = new MLContext(); - var inputColumnInformation = new ColumnInformation(); - inputColumnInformation.LabelColumnName = @"id"; - var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation); - - // File only have 3 columns: "id", "description" and "animal" - Assert.NotNull(result.ColumnInformation.LabelColumnName); - Assert.Equal(1, result.ColumnInformation.TextColumnNames.Count); - Assert.Equal(1, result.ColumnInformation.CategoricalColumnNames.Count); - - Assert.Equal("id", result.ColumnInformation.LabelColumnName); - Assert.Equal("description", result.ColumnInformation.TextColumnNames.First()); - Assert.Equal("animal", result.ColumnInformation.CategoricalColumnNames.First()); - } } } \ No newline at end of file From 13033bf9e248569de25f854b81368de18b97a19c Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Sat, 16 May 2020 00:18:45 -0700 Subject: [PATCH 18/28] Actually add new line characters to loaded strings --- test/BaselineOutput/Common/TextLoader/multiline.csv | 4 ++-- test/Microsoft.ML.Tests/TextLoaderTests.cs | 1 + test/data/multiline.csv | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/BaselineOutput/Common/TextLoader/multiline.csv b/test/BaselineOutput/Common/TextLoader/multiline.csv index fd4cb55e41..f1232b1c6f 100644 --- a/test/BaselineOutput/Common/TextLoader/multiline.csv +++ b/test/BaselineOutput/Common/TextLoader/multiline.csv @@ -11,10 +11,10 @@ 20 this is a description this is a quoted description -this is a multiline quoted description +this is a multiline\nquoted description this has one"doublequote which should be escaped as a single quote this has "doublequotes" inside of it -this is a multiline quoted description with "doublequotes" and empty new lines and escaped quotes inside of "it" //and this comment with // shouldn't be ignored since it is part of the "multiline" +this is a multiline\nquoted description with\n\n"doublequotes" and\n\nempty new lines and\n\n\n\nescaped quotes inside of "it"\n\n//and this comment with // shouldn't be ignored\nsince it is part of the "multiline" here is text after the empty line this is a line with an empty animal this is a line with an empty id diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index cee06ef868..94d405e306 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -987,6 +987,7 @@ public void TestLoadTextWithEscapedNewLines() for (int i = 0; i < numRows; i++) { line = file.ReadLine(); + line = line.Replace("\\n", "\n"); Assert.Equal(descriptions[i], line); } diff --git a/test/data/multiline.csv b/test/data/multiline.csv index 5d0397f71f..4a22df0854 100644 --- a/test/data/multiline.csv +++ b/test/data/multiline.csv @@ -1,4 +1,4 @@ -// this file should be loaded with quoting enabled +// this file should be loaded with quoting and readmultiline enabled // and it should load without problems by the TextLoader id,description,animal // this is a comment that will be ignored From 2983b06e12791d52cbe3e0773bc6677130927f7f Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Sat, 16 May 2020 01:05:20 -0700 Subject: [PATCH 19/28] Actually include new line characters in loaded strings --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 944febe1a5..8ed69d4cb2 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -505,9 +505,7 @@ public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreH if (line == null) // If we've reached the end of the file break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case? - if (line.Length != 0) - sb.Append(" "); // MYTODO: should we use instead a "\n" in here to separate lines? - + sb.Append("\n"); sb.Append(line); numOfQuotes += GetNumberOfChars(line, '"'); } From 9789479c7c4517ee7b8f12135db09d222100dfcc Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 18 May 2020 13:01:58 -0700 Subject: [PATCH 20/28] Let the TextSaver also correctly save new lines inside quoted fields --- .../DataLoadSave/Text/TextSaver.cs | 2 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs index 0b3208707a..7ad78898fc 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs @@ -827,7 +827,7 @@ internal static void MapText(ReadOnlySpan span, ref StringBuilder sb, char for (; ichCur < ichLim; ichCur++) { char ch = span[ichCur]; - if (ch != '"' && ch != sep && ch != ':') + if (ch != '"' && ch != sep && ch != ':' && ch != '\n') continue; if (!quoted) { diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 94d405e306..a6a0812545 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -921,8 +921,10 @@ public void TestLoadTextWithoutKeyTypeAttribute() Assert.Equal(expectedCount, data.Schema[1].Type.GetKeyCount()); } - [Fact] - public void TestLoadTextWithEscapedNewLines() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestLoadTextWithEscapedNewLines(bool useSaved) { var mlContext = new MLContext(seed: 1); var dataPath = GetDataPath("multiline.csv"); @@ -942,6 +944,19 @@ public void TestLoadTextWithEscapedNewLines() }; var data = mlContext.Data.LoadFromTextFile(dataPath, options); + if (useSaved) + { + // Check that loading the data view from a text file, + // and then saving that data view to another text file, then loading it again + // also matches the baseline. + + var savedPath = DeleteOutputPath("saved-multiline.tsv"); + using (var fs = File.Create(savedPath)) + mlContext.Data.SaveAsText(data, fs, separatorChar: '\t'); + + options.Separator = "\t"; + data = mlContext.Data.LoadFromTextFile(savedPath, options); + } // Get values from loaded dataview var ids = new List(); From 60c416913e784726237b9f228495716983434617 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 18 May 2020 14:05:07 -0700 Subject: [PATCH 21/28] Fixed bug when calling GetSomeLines --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index c339a2b1c5..0946bec904 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -701,11 +701,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, false, ref lines); + Cursor.GetSomeLines(headerFile, 1, parent._readMultilines, ref lines); if (needInputSize && inputSize == 0) Cursor.GetSomeLines(dataSample, 100, parent._readMultilines,ref lines); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, false, ref lines); + Cursor.GetSomeLines(dataSample, 1, parent._readMultilines, ref lines); if (needInputSize && inputSize == 0) { From fa28ddd4e4be8533d0fc457b49550b0f339056b8 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 18 May 2020 17:42:55 -0700 Subject: [PATCH 22/28] Throw exception when reaching EOF without ending quote on a given field. --- .../DataLoadSave/Text/TextLoaderCursor.cs | 8 +-- test/Microsoft.ML.Tests/TextLoaderTests.cs | 49 +++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 8ed69d4cb2..bbd44bf0df 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -472,7 +472,7 @@ private static class MultiLineReader // When reading lines that contain quoted fields, the quoted fields can contain // '\n' so we we'll need to read multiple lines (multilines) to get all the fields // of a given row. - public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreHashLine) + public static string ReadMultiLine(TextReader sr, StringBuilder sb, long lineNum, bool ignoreHashLine) { string line; line = sr.ReadLine(); @@ -503,7 +503,7 @@ public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreH line = sr.ReadLine(); if (line == null) // If we've reached the end of the file - break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case? + throw new EndOfStreamException($"A quote opened on a field on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote"); sb.Append("\n"); sb.Append(line); @@ -549,7 +549,7 @@ private void ThreadProc() // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) // and implementing all the necessary conversion functionality on it. See task 3871. if (_readMultilines) - text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true); + text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, true); else text = rdr.ReadLine(); @@ -580,7 +580,7 @@ private void ThreadProc() return; if (_readMultilines) - text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true); + text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, false); else text = rdr.ReadLine(); diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index a6a0812545..59f58d417d 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -1013,5 +1013,54 @@ public void TestLoadTextWithEscapedNewLines(bool useSaved) } } } + + [Fact] + public void TestInvalidMultilineCSVQuote() + { + var mlContext = new MLContext(seed: 1); + + string badInputCsv = + "id,description,animal\n" + + "9,\"this is a quoted field correctly formatted\",cat\n" + + "10,\"this is a quoted field\nwithout closing quote,cat\n" + + "11,this field isn't quoted,dog\n" + + "12,this will reach the end of the file without finding a closing quote so it will throw,frog\n" + ; + + var filePath = GetOutputPath("multiline-invalid.csv"); + File.WriteAllText(filePath, badInputCsv); + + bool threwException = false; + try + { + var options = new TextLoader.Options() + { + HasHeader = true, + Separator = ",", + AllowQuoting = true, + ReadMultilines = true, + Columns = new[] + { + new TextLoader.Column("id", DataKind.Int32, 0), + new TextLoader.Column("description", DataKind.String, 1), + new TextLoader.Column("animal", DataKind.String, 2), + }, + }; + + var data = mlContext.Data.LoadFromTextFile(filePath, options); + + data.Preview(); + } + catch(EndOfStreamException) + { + threwException = true; + } + catch(FormatException) + { + threwException = true; + } + + Assert.True(threwException, "Invalid file should have thrown an exception"); + } } } From 56279b40c11cc0a3ed617e14892c332c0fc9ee6f Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 18 May 2020 19:17:29 -0700 Subject: [PATCH 23/28] Refactored readMultilines into OptionsFlags --- .../DataLoadSave/Text/TextLoader.cs | 23 +++++++++++-------- .../DataLoadSave/Text/TextLoaderCursor.cs | 4 ++-- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 0946bec904..059a8a9bf1 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -701,11 +701,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, parent._readMultilines, ref lines); + Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, ref lines); if (needInputSize && inputSize == 0) - Cursor.GetSomeLines(dataSample, 100, parent._readMultilines,ref lines); + Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, ref lines); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, parent._readMultilines, ref lines); + Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, ref lines); if (needInputSize && inputSize == 0) { @@ -1062,7 +1062,7 @@ private static VersionInfo GetVersionInfo() // verWrittenCur: 0x00010009, // Introduced _flags //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header now retained if used and present - verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType + verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1080,15 +1080,14 @@ private enum OptionFlags : uint HasHeader = 0x02, AllowQuoting = 0x04, AllowSparse = 0x08, - - All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse + ReadMultilines = 0x10, + All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines } // This is reserved to mean the range extends to the end (the segment is variable). private const int SrcLim = int.MaxValue; private readonly bool _useThreads; - private readonly bool _readMultilines; private readonly OptionFlags _flags; private readonly long _maxRows; // Input size is zero for unknown - determined by the data (including sparse rows). @@ -1103,6 +1102,11 @@ private bool HasHeader get { return (_flags & OptionFlags.HasHeader) != 0; } } + private bool ReadMultilines + { + get { return (_flags & OptionFlags.ReadMultilines) != 0; } + } + private readonly IHost _host; private const string RegistrationName = "TextLoader"; @@ -1146,7 +1150,6 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo _host.Assert(Utils.Size(cols) > 0); _useThreads = options.UseThreads; - _readMultilines = options.AllowQuoting ? options.ReadMultilines : false; if (options.TrimWhitespace) _flags |= OptionFlags.TrimWhitespace; @@ -1156,6 +1159,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo _flags |= OptionFlags.AllowQuoting; if (options.AllowSparse) _flags |= OptionFlags.AllowSparse; + if (options.AllowQuoting && options.ReadMultilines) + _flags |= OptionFlags.ReadMultilines; // REVIEW: This should be persisted (if it should be maintained). _maxRows = options.MaxRows ?? long.MaxValue; @@ -1359,8 +1364,6 @@ private TextLoader(IHost host, ModelLoadContext ctx) // REVIEW: Should we serialize this? It really isn't part of the data model. _useThreads = true; - // MYTODO: Should we serialize this? probably yes and also include it in Flags - _readMultilines = false; // *** Binary format *** // int: sizeof(Float) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index bbd44bf0df..f8279cdc2e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil SetupCursor(parent, active, 0, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, 1); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, 1); var stats = new ParseStats(parent._host, 1); return new Cursor(parent, stats, active, reader, srcNeeded, cthd); } @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc SetupCursor(parent, active, n, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, cthd); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, cthd); var stats = new ParseStats(parent._host, cthd); if (cthd <= 1) return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; From 51d9390f04ef74da50ba6f5204c483cb33a05439 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 18 May 2020 19:18:10 -0700 Subject: [PATCH 24/28] Added test to check new readMultiline option from MAML. --- ...avePipeTextLoaderWithMoreOptions-1-out.txt | 33 +++++++++++++++++++ .../SavePipeTextLoaderWithMoreOptions-out.txt | 33 +++++++++++++++++++ .../TestCommandBase.cs | 19 ++++++++++- 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-1-out.txt create mode 100644 test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-out.txt diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-1-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-1-out.txt new file mode 100644 index 0000000000..e048de1eff --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-1-out.txt @@ -0,0 +1,33 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=animal:TX:2 +#@ } +id description animal +10 this is a description dog +11 this is a quoted description cat +12 "this is a multiline +quoted description" bird +13 "this has one""doublequote which should be escaped as a single quote" dog +14 "this has ""doublequotes"" inside of it" cat +15 "this is a multiline +quoted description with + +""doublequotes"" and + +empty new lines and + + + +escaped quotes inside of ""it"" + +//and this comment with // shouldn't be ignored +since it is part of the ""multiline""" bird +16 here is text after the empty line dog +17 this is a line with an empty animal "" +0 this is a line with an empty id bird +19 "" dog +20 this is the last row description cat +Wrote 11 rows of length 3 \ No newline at end of file diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-out.txt new file mode 100644 index 0000000000..e048de1eff --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-out.txt @@ -0,0 +1,33 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=animal:TX:2 +#@ } +id description animal +10 this is a description dog +11 this is a quoted description cat +12 "this is a multiline +quoted description" bird +13 "this has one""doublequote which should be escaped as a single quote" dog +14 "this has ""doublequotes"" inside of it" cat +15 "this is a multiline +quoted description with + +""doublequotes"" and + +empty new lines and + + + +escaped quotes inside of ""it"" + +//and this comment with // shouldn't be ignored +since it is part of the ""multiline""" bird +16 here is text after the empty line dog +17 this is a line with an empty animal "" +0 this is a line with an empty id bird +19 "" dog +20 this is the last row description cat +Wrote 11 rows of length 3 \ No newline at end of file diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index 425fbab697..9e2e77b2de 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2128,7 +2128,7 @@ public void Datatypes() public void SavePipeChooseColumnsByIndex() { string dataPath = GetDataPath("adult.tiny.with-schema.txt"); - const string loaderArgs = "loader=text{header+ col=Label:0 col=Cat:TX:1-8 col=Num:9-14 col=Name:TX:9}"; + const string loaderArgs = "loader=text{header+ multilines+ quote+ col=Label:0 col=Cat:TX:1-8 col=Num:9-14 col=Name:TX:9}"; OutputPath modelPath = ModelPath(); string extraArgs = "xf=ChooseColumnsByIndex{ind=3 ind=0}"; @@ -2140,6 +2140,23 @@ public void SavePipeChooseColumnsByIndex() Done(); } + [TestCategory("DataPipeSerialization")] + [Fact()] + public void SavePipeTextLoaderWithMoreOptions() + { + string dataPath = GetDataPath("multiline.csv"); + const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ col=id:Num:0 col=description:TX:1 col=animal:TX:2}"; + + OutputPath modelPath = ModelPath(); + string extraArgs = null; + TestCore("showdata", dataPath, loaderArgs, extraArgs); + + _step++; + + TestCore("showdata", dataPath, string.Format("in={{{0}}}", modelPath.Path), ""); + Done(); + } + [TestCategory("DataPipeSerialization")] [Fact()] public void SavePipeChooseColumnsByIndexDrop() From 5be5f6f0e65372e514e4b1832919aa99f4386214 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 18 May 2020 19:26:28 -0700 Subject: [PATCH 25/28] Fixed mistake on an unrelated test --- test/Microsoft.ML.TestFramework/TestCommandBase.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index 44dd8b9cb8..b7c286f3a4 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2138,7 +2138,7 @@ public void Datatypes() public void SavePipeChooseColumnsByIndex() { string dataPath = GetDataPath("adult.tiny.with-schema.txt"); - const string loaderArgs = "loader=text{header+ multilines+ quote+ col=Label:0 col=Cat:TX:1-8 col=Num:9-14 col=Name:TX:9}"; + const string loaderArgs = "loader=text{header+ col=Label:0 col=Cat:TX:1-8 col=Num:9-14 col=Name:TX:9}"; OutputPath modelPath = ModelPath(); string extraArgs = "xf=ChooseColumnsByIndex{ind=3 ind=0}"; From b4e3029a3af29ac076f6b0b4cbd93cb3476fbc83 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 18 May 2020 23:01:12 -0700 Subject: [PATCH 26/28] Added internal default to ReadMultilines --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 4bab5c53a4..b7a6c8cf66 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -493,7 +493,7 @@ public class Options /// [Argument(ArgumentType.AtMostOnce, ShortName = "header", HelpText = "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.")] - public bool HasHeader; + public bool HasHeader = Defaults.HasHeader; /// /// Whether to use separate parsing threads. @@ -506,7 +506,7 @@ public class Options /// If is false, this option is ignored. /// [Argument(ArgumentType.AtMostOnce, HelpText = "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", ShortName = "multilines", Hide = true)] - public bool ReadMultilines = false; + public bool ReadMultilines = Defaults.ReadMultilines; /// /// File containing a header with feature names. If specified, the header defined in the data file is ignored regardless of . @@ -537,6 +537,7 @@ internal static class Defaults internal const char Separator = '\t'; internal const bool HasHeader = false; internal const bool TrimWhitespace = false; + internal const bool ReadMultilines = false; } /// From 7e16fc7a66486b4b12ce43e56d6f607cb7f413ea Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 19 May 2020 02:23:05 -0700 Subject: [PATCH 27/28] Do more checking in MultilineReader in order to be more flexible in accepting invalid formats --- .../DataLoadSave/Text/TextLoader.cs | 6 +- .../DataLoadSave/Text/TextLoaderCursor.cs | 175 +++++++++++++++--- 2 files changed, 148 insertions(+), 33 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index b7a6c8cf66..7ea6ab17e9 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -702,11 +702,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, ref lines); + Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines); if (needInputSize && inputSize == 0) - Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, ref lines); + Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, ref lines); + Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines); if (needInputSize && inputSize == 0) { diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index f8279cdc2e..62f5709169 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil SetupCursor(parent, active, 0, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, 1); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1); var stats = new ParseStats(parent._host, 1); return new Cursor(parent, stats, active, reader, srcNeeded, cthd); } @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc SetupCursor(parent, active, n, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, cthd); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd); var stats = new ParseStats(parent._host, cthd); if (cthd <= 1) return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; @@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter() }; } - public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, ref List> lines) + public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines) { Contracts.AssertValue(source); Contracts.Assert(count > 0); @@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM count = 2; LineBatch batch; - var reader = new LineReader(source, count, 1, false, readMultilines, count, 1); + var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1); try { batch = reader.GetBatch(); @@ -403,6 +403,7 @@ private sealed class LineReader private readonly long _limit; private readonly bool _hasHeader; private readonly bool _readMultilines; + private readonly char[] _separators; private readonly int _batchSize; private readonly IMultiStreamSource _files; @@ -412,7 +413,7 @@ private sealed class LineReader private Task _thdRead; private volatile bool _abort; - public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, long limit, int cref) + public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref) { // Note that files is allowed to be empty. Contracts.AssertValue(files); @@ -426,6 +427,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has _hasHeader = hasHeader; _batchSize = batchSize; _readMultilines = readMultilines; + _separators = separators; _files = files; _cref = cref; @@ -467,15 +469,31 @@ public LineBatch GetBatch() throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception"); } - private static class MultiLineReader + private class MultiLineReader { + private readonly char _sep0; + private readonly char[] _separators; + private readonly bool _sepsContainsSpace; + private readonly StringBuilder _sb; + private readonly TextReader _rdr; + + public MultiLineReader(TextReader rdr, char[] separators) + { + Contracts.AssertNonEmpty(separators); + _sep0 = separators[0]; + _separators = separators; + _sepsContainsSpace = IsSep(' '); + _sb = new StringBuilder(); + _rdr = rdr; + } + // When reading lines that contain quoted fields, the quoted fields can contain // '\n' so we we'll need to read multiple lines (multilines) to get all the fields // of a given row. - public static string ReadMultiLine(TextReader sr, StringBuilder sb, long lineNum, bool ignoreHashLine) + public string ReadMultiLine(long lineNum, bool ignoreHashLine) { string line; - line = sr.ReadLine(); + line = _rdr.ReadLine(); // if it was an empty line or if we've reached the end of file (i.e. line = null) if (string.IsNullOrEmpty(line)) @@ -490,44 +508,140 @@ public static string ReadMultiLine(TextReader sr, StringBuilder sb, long lineNum if (ignoreHashLine && line[0] == '#') return line; - // Get more lines until the number of quote characters is even - // 2 consecutive quotes are considered scaped quotes - long numOfQuotes = GetNumberOfChars(line, '"'); - if (numOfQuotes % 2 == 0) + // Get more lines until the last field of the line doesn't contain its newline + // inside a quoted field + bool lastFieldIncludesNewLine = LastFieldIncludesNewLine(line, false); + if (!lastFieldIncludesNewLine) return line; - sb.Clear(); - sb.Append(line); - while (numOfQuotes % 2 != 0) + _sb.Clear(); + _sb.Append(line); + while (lastFieldIncludesNewLine) + { + line = _rdr.ReadLine(); + + if (line == null) + throw new EndOfStreamException($"A quoted field opened on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote"); + + _sb.Append("\n"); + _sb.Append(line); + lastFieldIncludesNewLine = LastFieldIncludesNewLine(line, true); + } + + return _sb.ToString(); + } + + // The startsInsideQuoted parameter indicates if the last field of the previous line + // ended in a quoted field which included the newline character, + // if it is true, then the beginning of this line is considered to be part + // of the last field of the previous line. + public bool LastFieldIncludesNewLine(string line, bool startsInsideQuoted = false) + { + if (line.Length == 0) + return startsInsideQuoted; + + int ichCur = 0; + int ichLim = line.Length; + bool quotingError = false; + + bool ret = FieldIncludesNewLine(ref line, ref ichCur, ichLim, ref quotingError, startsInsideQuoted); + while (ichCur < ichLim) { - line = sr.ReadLine(); + ret = FieldIncludesNewLine(ref line, ref ichCur, ichLim, ref quotingError, false); + if(quotingError) + return false; + + // Skip empty fields + while (ichCur < ichLim && IsSep(line[ichCur])) + ichCur++; + } - if (line == null) // If we've reached the end of the file - throw new EndOfStreamException($"A quote opened on a field on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote"); + return ret; + } - sb.Append("\n"); - sb.Append(line); - numOfQuotes += GetNumberOfChars(line, '"'); + private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim, + ref bool quotingError, bool startsInsideQuoted) + { + if (!startsInsideQuoted && !_sepsContainsSpace) + { + // Ignore leading spaces + while (ichCur < ichLim && line[ichCur] == ' ') + ichCur++; } - return sb.ToString(); + if(startsInsideQuoted || line[ichCur] == '"') + { + // Quoted Field Case + + if (!startsInsideQuoted) + ichCur++; + + for (; ; ichCur++) + { + if (ichCur >= ichLim) + // We've reached the end of the line without finding the closing quote, + // so next line will start on this quoted field + return true; + + if (line[ichCur] == '"') + { + if (++ichCur >= ichLim) + // Last character in line was the closing quote of the field + return false; + + if (line[ichCur] == '"') + // 2 Double quotes means escaped quote + continue; + + // If it wasn't an escaped quote, then this is supposed to be + // the closing quote of the field, and there should only be spaces remaining + // until the next separator. + + if (!_sepsContainsSpace) + { + // Ignore leading spaces + while (ichCur < ichLim && line[ichCur] == ' ') + ichCur++; + } + + // If there's anything else than spaces or the next separator, + // this will actually be a QuotingError on the parser, so we decide that this + // line contains a quoting error, and so it's not going to be considered a valid field + // and the rest of the line should be ignored. + if (ichCur >= ichLim || IsSep(line[ichCur])) + return false; + + quotingError = true; + return false; + } + } + } + + // Unquoted field case. + // An unquoted field shouldn't contain new lines + while(ichCur < ichLim && !IsSep(line[ichCur])) + { + ichCur++; + } + return false; } - public static int GetNumberOfChars(string line, char ch) + private bool IsSep(char ch) { - int count = 0; - foreach (char c in line) + if (ch == _sep0) + return true; + for (int i = 1; i < _separators.Length; i++) { - if (c == ch) count++; + if (ch == _separators[i]) + return true; } - return count; + return false; } } private void ThreadProc() { Contracts.Assert(_batchSize >= 2); - var multilineSB = new StringBuilder(); try { @@ -541,6 +655,7 @@ private void ThreadProc() string path = _files.GetPathOrNull(ifile); using (var rdr = _files.OpenTextReader(ifile)) { + var multilineReader = new MultiLineReader(rdr, _separators); string text; long line = 0; for (; ; ) @@ -549,7 +664,7 @@ private void ThreadProc() // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) // and implementing all the necessary conversion functionality on it. See task 3871. if (_readMultilines) - text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, true); + text = multilineReader.ReadMultiLine(line, true); else text = rdr.ReadLine(); @@ -580,7 +695,7 @@ private void ThreadProc() return; if (_readMultilines) - text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, false); + text = multilineReader.ReadMultiLine(line, false); else text = rdr.ReadLine(); From f0652a58b072a124ffc16e01f43124362502f584 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 19 May 2020 02:23:38 -0700 Subject: [PATCH 28/28] Updated tests --- ...out.txt => SavePipeTextLoaderWithMultilines-1-out.txt} | 6 ++++-- ...s-out.txt => SavePipeTextLoaderWithMultilines-out.txt} | 6 ++++-- test/BaselineOutput/Common/TextLoader/multiline.csv | 8 +++++++- test/Microsoft.ML.TestFramework/TestCommandBase.cs | 2 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 2 +- test/data/multiline.csv | 4 +++- 6 files changed, 20 insertions(+), 8 deletions(-) rename test/BaselineOutput/Common/Command/{SavePipeTextLoaderWithMoreOptions-1-out.txt => SavePipeTextLoaderWithMultilines-1-out.txt} (82%) rename test/BaselineOutput/Common/Command/{SavePipeTextLoaderWithMoreOptions-out.txt => SavePipeTextLoaderWithMultilines-out.txt} (82%) diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-1-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-1-out.txt similarity index 82% rename from test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-1-out.txt rename to test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-1-out.txt index e048de1eff..64cdbe8894 100644 --- a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-1-out.txt +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-1-out.txt @@ -29,5 +29,7 @@ since it is part of the ""multiline""" bird 17 this is a line with an empty animal "" 0 this is a line with an empty id bird 19 "" dog -20 this is the last row description cat -Wrote 11 rows of length 3 \ No newline at end of file +20 "we also allow""quotes in the middle of fields" cat +21 "or also at the end""" bird +22 this is the last row description dog +Wrote 13 rows of length 3 diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-out.txt similarity index 82% rename from test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-out.txt rename to test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-out.txt index e048de1eff..64cdbe8894 100644 --- a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMoreOptions-out.txt +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-out.txt @@ -29,5 +29,7 @@ since it is part of the ""multiline""" bird 17 this is a line with an empty animal "" 0 this is a line with an empty id bird 19 "" dog -20 this is the last row description cat -Wrote 11 rows of length 3 \ No newline at end of file +20 "we also allow""quotes in the middle of fields" cat +21 "or also at the end""" bird +22 this is the last row description dog +Wrote 13 rows of length 3 diff --git a/test/BaselineOutput/Common/TextLoader/multiline.csv b/test/BaselineOutput/Common/TextLoader/multiline.csv index f1232b1c6f..4452061d56 100644 --- a/test/BaselineOutput/Common/TextLoader/multiline.csv +++ b/test/BaselineOutput/Common/TextLoader/multiline.csv @@ -9,6 +9,8 @@ 0 19 20 +21 +22 this is a description this is a quoted description this is a multiline\nquoted description @@ -19,6 +21,8 @@ here is text after the empty line this is a line with an empty animal this is a line with an empty id +we also allow"quotes in the middle of fields +or also at the end" this is the last row description dog cat @@ -30,4 +34,6 @@ dog bird dog -cat \ No newline at end of file +cat +bird +dog \ No newline at end of file diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index b7c286f3a4..6e5745d932 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2152,7 +2152,7 @@ public void SavePipeChooseColumnsByIndex() [TestCategory("DataPipeSerialization")] [Fact()] - public void SavePipeTextLoaderWithMoreOptions() + public void SavePipeTextLoaderWithMultilines() { string dataPath = GetDataPath("multiline.csv"); const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ col=id:Num:0 col=description:TX:1 col=animal:TX:2}"; diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 4ae83eebc5..b2421bacce 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -1001,7 +1001,7 @@ public void TestLoadTextWithEscapedNewLines(bool useSaved) } } - const int numRows = 11; + const int numRows = 13; Assert.Equal(numRows, ids.Count()); Assert.Equal(numRows, descriptions.Count()); Assert.Equal(numRows, animals.Count()); diff --git a/test/data/multiline.csv b/test/data/multiline.csv index 4a22df0854..2defac230a 100644 --- a/test/data/multiline.csv +++ b/test/data/multiline.csv @@ -29,4 +29,6 @@ since it is part of the ""multiline""",bird 17, this is a line with an empty animal,"" "", this is a line with an empty id, bird 19,"",dog -20,this is the last row description,cat \ No newline at end of file +20,we also allow"quotes in the middle of fields,cat +21,or also at the end",bird +22,this is the last row description,dog \ No newline at end of file