diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index aa173e8a59..7ea6ab17e9 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -493,7 +493,7 @@ public class Options /// [Argument(ArgumentType.AtMostOnce, ShortName = "header", HelpText = "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.")] - public bool HasHeader; + public bool HasHeader = Defaults.HasHeader; /// /// Whether to use separate parsing threads. @@ -501,6 +501,13 @@ public class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Use separate parsing threads?", ShortName = "threads", Hide = true)] public bool UseThreads = true; + /// + /// If true, new line characters are acceptable inside a quoted field, and thus one field can have multiple lines of text inside it + /// If is false, this option is ignored. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", ShortName = "multilines", Hide = true)] + public bool ReadMultilines = Defaults.ReadMultilines; + /// /// File containing a header with feature names. If specified, the header defined in the data file is ignored regardless of . /// @@ -530,6 +537,7 @@ internal static class Defaults internal const char Separator = '\t'; internal const bool HasHeader = false; internal const bool TrimWhitespace = false; + internal const bool ReadMultilines = false; } /// @@ -694,11 +702,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, ref lines); + Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines); if (needInputSize && inputSize == 0) - Cursor.GetSomeLines(dataSample, 100, ref lines); + Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, ref lines); + Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines); if (needInputSize && inputSize == 0) { @@ -1055,7 +1063,7 @@ private static VersionInfo GetVersionInfo() // verWrittenCur: 0x00010009, // Introduced _flags //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header now retained if used and present - verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType + verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1073,8 +1081,8 @@ private enum OptionFlags : uint HasHeader = 0x02, AllowQuoting = 0x04, AllowSparse = 0x08, - - All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse + ReadMultilines = 0x10, + All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines } // This is reserved to mean the range extends to the end (the segment is variable). @@ -1095,6 +1103,11 @@ private bool HasHeader get { return (_flags & OptionFlags.HasHeader) != 0; } } + private bool ReadMultilines + { + get { return (_flags & OptionFlags.ReadMultilines) != 0; } + } + private readonly IHost _host; private const string RegistrationName = "TextLoader"; @@ -1147,6 +1160,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo _flags |= OptionFlags.AllowQuoting; if (options.AllowSparse) _flags |= OptionFlags.AllowSparse; + if (options.AllowQuoting && options.ReadMultilines) + _flags |= OptionFlags.ReadMultilines; // REVIEW: This should be persisted (if it should be maintained). _maxRows = options.MaxRows ?? long.MaxValue; diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index fed1b97c7d..62f5709169 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.IO; using System.Text; using System.Threading; using System.Threading.Tasks; @@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil SetupCursor(parent, active, 0, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1); var stats = new ParseStats(parent._host, 1); return new Cursor(parent, stats, active, reader, srcNeeded, cthd); } @@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc SetupCursor(parent, active, n, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd); var stats = new ParseStats(parent._host, cthd); if (cthd <= 1) return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; @@ -204,7 +205,7 @@ public override ValueGetter GetIdGetter() }; } - public static void GetSomeLines(IMultiStreamSource source, int count, ref List> lines) + public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines) { Contracts.AssertValue(source); Contracts.Assert(count > 0); @@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List= ichLim) + // We've reached the end of the line without finding the closing quote, + // so next line will start on this quoted field + return true; + + if (line[ichCur] == '"') + { + if (++ichCur >= ichLim) + // Last character in line was the closing quote of the field + return false; + + if (line[ichCur] == '"') + // 2 Double quotes means escaped quote + continue; + + // If it wasn't an escaped quote, then this is supposed to be + // the closing quote of the field, and there should only be spaces remaining + // until the next separator. + + if (!_sepsContainsSpace) + { + // Ignore leading spaces + while (ichCur < ichLim && line[ichCur] == ' ') + ichCur++; + } + + // If there's anything else than spaces or the next separator, + // this will actually be a QuotingError on the parser, so we decide that this + // line contains a quoting error, and so it's not going to be considered a valid field + // and the rest of the line should be ignored. + if (ichCur >= ichLim || IsSep(line[ichCur])) + return false; + + quotingError = true; + return false; + } + } + } + + // Unquoted field case. + // An unquoted field shouldn't contain new lines + while(ichCur < ichLim && !IsSep(line[ichCur])) + { + ichCur++; + } + return false; + } + + private bool IsSep(char ch) + { + if (ch == _sep0) + return true; + for (int i = 1; i < _separators.Length; i++) + { + if (ch == _separators[i]) + return true; + } + return false; + } + } + private void ThreadProc() { Contracts.Assert(_batchSize >= 2); @@ -480,6 +655,7 @@ private void ThreadProc() string path = _files.GetPathOrNull(ifile); using (var rdr = _files.OpenTextReader(ifile)) { + var multilineReader = new MultiLineReader(rdr, _separators); string text; long line = 0; for (; ; ) @@ -487,7 +663,11 @@ private void ThreadProc() // REVIEW: Avoid allocating a string for every line. This would probably require // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) // and implementing all the necessary conversion functionality on it. See task 3871. - text = rdr.ReadLine(); + if (_readMultilines) + text = multilineReader.ReadMultiLine(line, true); + else + text = rdr.ReadLine(); + if (text == null) goto LNext; line++; @@ -514,7 +694,11 @@ private void ThreadProc() if (_abort) return; - text = rdr.ReadLine(); + if (_readMultilines) + text = multilineReader.ReadMultiLine(line, false); + else + text = rdr.ReadLine(); + if (text == null) { // We're done with this file. Queue the last partial batch. diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index dd45e8c240..13019c4bf2 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan span) scan.QuotingError = true; break; } + + // The logic below allow us to escape quotes (") inside quoted + // fields by using doublo quotes (""). I.e. when the loader + // encounters "" inside a quoted field, it will output only one " + // and continue parsing the rest of the field. if (span[ichCur] == '"') { if (ichCur > ichRun) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 779d08cec3..0fcc23fcef 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -64,7 +64,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, HasHeader = hasHeader, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace, - AllowSparse = allowSparse + AllowSparse = allowSparse, }; return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs index 0b3208707a..7ad78898fc 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs @@ -827,7 +827,7 @@ internal static void MapText(ReadOnlySpan span, ref StringBuilder sb, char for (; ichCur < ichLim; ichCur++) { char ch = span[ichCur]; - if (ch != '"' && ch != sep && ch != ':') + if (ch != '"' && ch != sep && ch != ':' && ch != '\n') continue; if (!quoted) { diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-1-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-1-out.txt new file mode 100644 index 0000000000..64cdbe8894 --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-1-out.txt @@ -0,0 +1,35 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=animal:TX:2 +#@ } +id description animal +10 this is a description dog +11 this is a quoted description cat +12 "this is a multiline +quoted description" bird +13 "this has one""doublequote which should be escaped as a single quote" dog +14 "this has ""doublequotes"" inside of it" cat +15 "this is a multiline +quoted description with + +""doublequotes"" and + +empty new lines and + + + +escaped quotes inside of ""it"" + +//and this comment with // shouldn't be ignored +since it is part of the ""multiline""" bird +16 here is text after the empty line dog +17 this is a line with an empty animal "" +0 this is a line with an empty id bird +19 "" dog +20 "we also allow""quotes in the middle of fields" cat +21 "or also at the end""" bird +22 this is the last row description dog +Wrote 13 rows of length 3 diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-out.txt new file mode 100644 index 0000000000..64cdbe8894 --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMultilines-out.txt @@ -0,0 +1,35 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=animal:TX:2 +#@ } +id description animal +10 this is a description dog +11 this is a quoted description cat +12 "this is a multiline +quoted description" bird +13 "this has one""doublequote which should be escaped as a single quote" dog +14 "this has ""doublequotes"" inside of it" cat +15 "this is a multiline +quoted description with + +""doublequotes"" and + +empty new lines and + + + +escaped quotes inside of ""it"" + +//and this comment with // shouldn't be ignored +since it is part of the ""multiline""" bird +16 here is text after the empty line dog +17 this is a line with an empty animal "" +0 this is a line with an empty id bird +19 "" dog +20 "we also allow""quotes in the middle of fields" cat +21 "or also at the end""" bird +22 this is the last row description dog +Wrote 13 rows of length 3 diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index d139710b10..67033afde3 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -405,6 +405,18 @@ "IsNullable": false, "Default": true }, + { + "Name": "ReadMultilines", + "Type": "Bool", + "Desc": "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", + "Aliases": [ + "multilines" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, { "Name": "HeaderFile", "Type": "String", diff --git a/test/BaselineOutput/Common/TextLoader/multiline.csv b/test/BaselineOutput/Common/TextLoader/multiline.csv new file mode 100644 index 0000000000..4452061d56 --- /dev/null +++ b/test/BaselineOutput/Common/TextLoader/multiline.csv @@ -0,0 +1,39 @@ +10 +11 +12 +13 +14 +15 +16 +17 +0 +19 +20 +21 +22 +this is a description +this is a quoted description +this is a multiline\nquoted description +this has one"doublequote which should be escaped as a single quote +this has "doublequotes" inside of it +this is a multiline\nquoted description with\n\n"doublequotes" and\n\nempty new lines and\n\n\n\nescaped quotes inside of "it"\n\n//and this comment with // shouldn't be ignored\nsince it is part of the "multiline" +here is text after the empty line +this is a line with an empty animal +this is a line with an empty id + +we also allow"quotes in the middle of fields +or also at the end" +this is the last row description +dog +cat +bird +dog +cat +bird +dog + +bird +dog +cat +bird +dog \ No newline at end of file diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index 30ea91bb6d..6e5745d932 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2150,6 +2150,23 @@ public void SavePipeChooseColumnsByIndex() Done(); } + [TestCategory("DataPipeSerialization")] + [Fact()] + public void SavePipeTextLoaderWithMultilines() + { + string dataPath = GetDataPath("multiline.csv"); + const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ col=id:Num:0 col=description:TX:1 col=animal:TX:2}"; + + OutputPath modelPath = ModelPath(); + string extraArgs = null; + TestCore("showdata", dataPath, loaderArgs, extraArgs); + + _step++; + + TestCore("showdata", dataPath, string.Format("in={{{0}}}", modelPath.Path), ""); + Done(); + } + [TestCategory("DataPipeSerialization")] [Fact()] public void SavePipeChooseColumnsByIndexDrop() diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 1f61227bba..b2421bacce 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -660,7 +660,7 @@ public class ModelWithColumnNameAttribute } } - public class TextLoaderFromModelTests : BaseTestClass + public class TextLoaderFromModelTests : BaseTestBaseline { public TextLoaderFromModelTests(ITestOutputHelper output) : base(output) @@ -937,5 +937,147 @@ public void TestLoadTextWithoutKeyTypeAttribute() Assert.Equal(expectedCount, data.Schema[1].Type.GetKeyCount()); } + + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestLoadTextWithEscapedNewLines(bool useSaved) + { + var mlContext = new MLContext(seed: 1); + var dataPath = GetDataPath("multiline.csv"); + var baselinePath = GetBaselinePath("TextLoader", "multiline.csv"); + var options = new TextLoader.Options() + { + HasHeader = true, + Separator = ",", + AllowQuoting = true, + ReadMultilines = true, + Columns = new[] + { + new TextLoader.Column("id", DataKind.Int32, 0), + new TextLoader.Column("description", DataKind.String, 1), + new TextLoader.Column("animal", DataKind.String, 2), + }, + }; + + var data = mlContext.Data.LoadFromTextFile(dataPath, options); + if (useSaved) + { + // Check that loading the data view from a text file, + // and then saving that data view to another text file, then loading it again + // also matches the baseline. + + var savedPath = DeleteOutputPath("saved-multiline.tsv"); + using (var fs = File.Create(savedPath)) + mlContext.Data.SaveAsText(data, fs, separatorChar: '\t'); + + options.Separator = "\t"; + data = mlContext.Data.LoadFromTextFile(savedPath, options); + } + + // Get values from loaded dataview + var ids = new List(); + var descriptions = new List(); + var animals = new List(); + using(var curs = data.GetRowCursorForAllColumns()) + { + var idGetter = curs.GetGetter(data.Schema["id"]); + var descriptionGetter = curs.GetGetter>(data.Schema["description"]); + var animalGetter = curs.GetGetter>(data.Schema["animal"]); + + int id = default; + ReadOnlyMemory description = default; + ReadOnlyMemory animal = default; + + while(curs.MoveNext()) + { + idGetter(ref id); + descriptionGetter(ref description); + animalGetter(ref animal); + + ids.Add(id.ToString()); + descriptions.Add(description.ToString()); + animals.Add(animal.ToString()); + } + } + + const int numRows = 13; + Assert.Equal(numRows, ids.Count()); + Assert.Equal(numRows, descriptions.Count()); + Assert.Equal(numRows, animals.Count()); + + // Compare values with baseline file + string line; + using (var file = new StreamReader(baselinePath)) + { + for(int i = 0; i < numRows; i++) + { + line = file.ReadLine(); + Assert.Equal(ids[i], line); + } + + for (int i = 0; i < numRows; i++) + { + line = file.ReadLine(); + line = line.Replace("\\n", "\n"); + Assert.Equal(descriptions[i], line); + } + + for (int i = 0; i < numRows; i++) + { + line = file.ReadLine(); + Assert.Equal(animals[i], line); + } + } + } + + [Fact] + public void TestInvalidMultilineCSVQuote() + { + var mlContext = new MLContext(seed: 1); + + string badInputCsv = + "id,description,animal\n" + + "9,\"this is a quoted field correctly formatted\",cat\n" + + "10,\"this is a quoted field\nwithout closing quote,cat\n" + + "11,this field isn't quoted,dog\n" + + "12,this will reach the end of the file without finding a closing quote so it will throw,frog\n" + ; + + var filePath = GetOutputPath("multiline-invalid.csv"); + File.WriteAllText(filePath, badInputCsv); + + bool threwException = false; + try + { + var options = new TextLoader.Options() + { + HasHeader = true, + Separator = ",", + AllowQuoting = true, + ReadMultilines = true, + Columns = new[] + { + new TextLoader.Column("id", DataKind.Int32, 0), + new TextLoader.Column("description", DataKind.String, 1), + new TextLoader.Column("animal", DataKind.String, 2), + }, + }; + + var data = mlContext.Data.LoadFromTextFile(filePath, options); + + data.Preview(); + } + catch(EndOfStreamException) + { + threwException = true; + } + catch(FormatException) + { + threwException = true; + } + + Assert.True(threwException, "Invalid file should have thrown an exception"); + } } } diff --git a/test/data/multiline.csv b/test/data/multiline.csv new file mode 100644 index 0000000000..2defac230a --- /dev/null +++ b/test/data/multiline.csv @@ -0,0 +1,34 @@ +// this file should be loaded with quoting and readmultiline enabled +// and it should load without problems by the TextLoader +id,description,animal +// this is a comment that will be ignored +// this is a comment with "quotes" that will also be ignored +// this is a comment with a "quote without close quote that will also be ignored +10,this is a description,dog +11,"this is a quoted description",cat +12,"this is a multiline +quoted description", bird +13,"this has one""doublequote which should be escaped as a single quote",dog +14,"this has ""doublequotes"" inside of it",cat +15, "this is a multiline +quoted description with + +""doublequotes"" and + +empty new lines and + + + +escaped quotes inside of ""it"" + +//and this comment with // shouldn't be ignored +since it is part of the ""multiline""",bird +// this line should be ignored, and the next line is empty: + +16, here is text after the empty line, dog +17, this is a line with an empty animal,"" +"", this is a line with an empty id, bird +19,"",dog +20,we also allow"quotes in the middle of fields,cat +21,or also at the end",bird +22,this is the last row description,dog \ No newline at end of file