From a5ac0226c6f6a0994fb00a7a865a526042f8eb14 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 28 May 2020 00:00:49 -0700 Subject: [PATCH 01/21] Updated MAML HelpText --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 2cd1e8e8cf..7c89712716 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -433,10 +433,9 @@ public class Options /// [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether the input may include quoted values, which can contain separator characters, colons," + - " and distinguish empty values from missing values. When true, consecutive separators denote a" + - " missing value and an empty value is denoted by \"\". When false, consecutive separators" + - " denote an empty value.", + "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value" + + "from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all" + + "separators, even those within quotes, are treated as delimiting a new column.", ShortName = "quote")] public bool AllowQuoting = Defaults.AllowQuoting; From e7e7da7a8dbaa8b4d53b2fe7c99dfbbb15768018 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 28 May 2020 01:32:37 -0700 Subject: [PATCH 02/21] Added ImputeEmptyFloats option in TextLoader and EmptyAsNaN in DoubleParser. Made changes to make them work. 
--- .../Utilities/DoubleParser.cs | 21 +++++++++++++++---- .../DataLoadSave/Text/TextLoader.cs | 14 +++++++++++-- .../DataLoadSave/Text/TextLoaderParser.cs | 13 +++++++++++- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs index 58ee99f5ef..d9d18ba5f7 100644 --- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs +++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs @@ -23,6 +23,11 @@ internal enum OptionFlags : uint // a number and its decimal part). If this isn't set, then // default behavior is to use "." as decimal marker. UseCommaAsDecimalMarker = 0x01, + + // If this flag is set, then empty spans (or those with only spaces) + // will be parsed as NaN. If it isn't set, then default behavior + // is to return them as 0. + EmptyAsNaN = 0x02, } private const ulong TopBit = 0x8000000000000000UL; @@ -86,7 +91,7 @@ public enum Result public static bool TryParse(ReadOnlySpan span, out Single value, OptionFlags flags = OptionFlags.Default) { var res = Parse(span, out value, flags); - Contracts.Assert(res != Result.Empty || value == 0); + Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Single.IsNaN(value)); return res <= Result.Empty; } @@ -96,7 +101,7 @@ public static bool TryParse(ReadOnlySpan span, out Single value, OptionFla public static bool TryParse(ReadOnlySpan span, out Double value, OptionFlags flags = OptionFlags.Default) { var res = Parse(span, out value, flags); - Contracts.Assert(res != Result.Empty || value == 0); + Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Double.IsNaN(value)); return res <= Result.Empty; } @@ -107,7 +112,11 @@ public static Result Parse(ReadOnlySpan span, out Single value, OptionFlag { if (ich >= span.Length) { - value = 0; + if ((flags & OptionFlags.EmptyAsNaN) == 0) + value = 0; + else + value = Single.NaN; + return 
Result.Empty; } if (!char.IsWhiteSpace(span[ich])) @@ -155,7 +164,11 @@ public static Result Parse(ReadOnlySpan span, out Double value, OptionFlag { if (ich >= span.Length) { - value = 0; + if ((flags & OptionFlags.EmptyAsNaN) == 0) + value = 0; + else + value = Double.NaN; + return Result.Empty; } if (!char.IsWhiteSpace(span[ich])) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 7c89712716..23fe7862ba 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -532,6 +532,12 @@ public class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")] public char EscapeChar = Defaults.EscapeChar; + /// + /// If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "imputefloat")] + public bool ImputeEmptyFloats = Defaults.ImputeEmptyFloats; + /// /// Checks that all column specifications are valid (that is, ranges are disjoint and have min<=max). 
/// @@ -551,6 +557,7 @@ internal static class Defaults internal const bool TrimWhitespace = false; internal const bool ReadMultilines = false; internal const char EscapeChar = '"'; + internal const bool ImputeEmptyFloats = false; } /// @@ -1077,7 +1084,7 @@ private static VersionInfo GetVersionInfo() //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header now retained if used and present //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags - verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker + verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars and imputeEmptyFloats flag verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1096,7 +1103,8 @@ private enum OptionFlags : uint AllowQuoting = 0x04, AllowSparse = 0x08, ReadMultilines = 0x10, - All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines + ImputeEmptyFloats = 0x20, + All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | ImputeEmptyFloats } // This is reserved to mean the range extends to the end (the segment is variable). @@ -1178,6 +1186,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo _flags |= OptionFlags.AllowSparse; if (options.AllowQuoting && options.ReadMultilines) _flags |= OptionFlags.ReadMultilines; + if (options.ImputeEmptyFloats) + _flags |= OptionFlags.ImputeEmptyFloats; // REVIEW: This should be persisted (if it should be maintained). _maxRows = options.MaxRows ?? 
long.MaxValue; diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 897c257e2a..9082ed03e8 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -671,6 +671,8 @@ public Parser(TextLoader parent) var doubleParserOptionFlags = DoubleParser.OptionFlags.Default; if (parent._decimalMarker == ',') doubleParserOptionFlags |= DoubleParser.OptionFlags.UseCommaAsDecimalMarker; + if ((parent._flags & OptionFlags.ImputeEmptyFloats) != 0) + doubleParserOptionFlags |= DoubleParser.OptionFlags.EmptyAsNaN; if (doubleParserOptionFlags == DoubleParser.OptionFlags.Default) cache = ValueCreatorCache.DefaultInstance; @@ -900,6 +902,7 @@ private sealed class HelperImpl : Helper private readonly int _srcNeeded; private readonly bool _quoting; private readonly bool _sparse; + private readonly bool _keepEmpty; // This is a working buffer. 
private readonly StringBuilder _sb; @@ -927,6 +930,7 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC _srcNeeded = srcNeeded; _quoting = (flags & OptionFlags.AllowQuoting) != 0; _sparse = (flags & OptionFlags.AllowSparse) != 0; + _keepEmpty = (flags & OptionFlags.ImputeEmptyFloats) != 0; _sb = new StringBuilder(); _blank = ReadOnlyMemory.Empty; Fields = new FieldSet(); @@ -972,12 +976,19 @@ public int GatherFields(ReadOnlyMemory lineSpan, ReadOnlySpan span, if (scan.QuotingError) _stats.LogBadFmt(ref scan, "Illegal quoting"); - if (!scan.Span.IsEmpty) + if (!scan.Span.IsEmpty || _keepEmpty) { Fields.EnsureSpace(); Fields.Spans[Fields.Count] = scan.Span; Fields.Indices[Fields.Count++] = src; } + else if(_keepEmpty) + { + Fields.EnsureSpace(); + Fields.Spans[Fields.Count] = ReadOnlyMemory.Empty; + Fields.Indices[Fields.Count++] = src; + } + if (++src > _srcNeeded || !more) break; } From a7a454ff160d9edaaa95242bb315dc47888a8956 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 28 May 2020 01:41:15 -0700 Subject: [PATCH 03/21] Small changes related to _keepEmpty --- .../DataLoadSave/Text/TextLoaderParser.cs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 9082ed03e8..2c643de304 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -930,10 +930,14 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC _srcNeeded = srcNeeded; _quoting = (flags & OptionFlags.AllowQuoting) != 0; _sparse = (flags & OptionFlags.AllowSparse) != 0; - _keepEmpty = (flags & OptionFlags.ImputeEmptyFloats) != 0; _sb = new StringBuilder(); _blank = ReadOnlyMemory.Empty; Fields = new FieldSet(); + + // If we want to impute empty float fields, then we must keep + // all empty fields spans, 
as there's no way for the Parser.HelperImpl + // to know beforehand which fields belong to a float field + _keepEmpty = (flags & OptionFlags.ImputeEmptyFloats) != 0; } /// @@ -985,7 +989,7 @@ public int GatherFields(ReadOnlyMemory lineSpan, ReadOnlySpan span, else if(_keepEmpty) { Fields.EnsureSpace(); - Fields.Spans[Fields.Count] = ReadOnlyMemory.Empty; + Fields.Spans[Fields.Count] = _blank; Fields.Indices[Fields.Count++] = src; } From 7d103584ed101bc67672caa8511390eaf7e8ea24 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 11:01:58 -0700 Subject: [PATCH 04/21] Minor change --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 2c643de304..406a0bd989 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -980,7 +980,7 @@ public int GatherFields(ReadOnlyMemory lineSpan, ReadOnlySpan span, if (scan.QuotingError) _stats.LogBadFmt(ref scan, "Illegal quoting"); - if (!scan.Span.IsEmpty || _keepEmpty) + if (!scan.Span.IsEmpty) { Fields.EnsureSpace(); Fields.Spans[Fields.Count] = scan.Span; From 4dd081ce805993be629c051b2b81507799712cc8 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 11:22:03 -0700 Subject: [PATCH 05/21] Updated docs --- docs/code/IDataViewTypeSystem.md | 5 ++++- src/Microsoft.ML.Core/Utilities/DoubleParser.cs | 4 ++-- src/Microsoft.ML.Data/Data/Conversion.cs | 15 +++++++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/docs/code/IDataViewTypeSystem.md b/docs/code/IDataViewTypeSystem.md index 76e32c22ca..fba846c31a 100644 --- a/docs/code/IDataViewTypeSystem.md +++ b/docs/code/IDataViewTypeSystem.md @@ -539,7 +539,10 @@ is first processed entirely as `TX` values, then parsed, or processed directly into 
numeric values, that is, parsing as the row is processed. In the latter case, it is simple to map implicit items (suppressed due to sparsity) to zero. In the former case, these items are first mapped to the empty text value. To -get the same result, we need empty text to map to zero. +get the same result, we need empty text to map to zero. An exception to this +rule has been permitted in the TextLoader, where there's an option to load +empty `TX` fields as `NaN` for `R4` and `R8` fields, instead of using the default +conversion of empty `TX` to the numeric default `0`. ### Floating Point diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs index d9d18ba5f7..dd13517731 100644 --- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs +++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs @@ -86,7 +86,7 @@ public enum Result } /// - /// This produces zero for an empty string. + /// This produces zero for an empty string, or NaN depending on the <see cref="DoubleParser.OptionFlags"/> used. /// public static bool TryParse(ReadOnlySpan span, out Single value, OptionFlags flags = OptionFlags.Default) { @@ -96,7 +96,7 @@ public static bool TryParse(ReadOnlySpan span, out Single value, OptionFla } /// - /// This produces zero for an empty string. + /// This produces zero for an empty string, or NaN depending on the <see cref="DoubleParser.OptionFlags"/> used. /// public static bool TryParse(ReadOnlySpan span, out Double value, OptionFlags flags = OptionFlags.Default) { diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 322856c22f..29f35069dd 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -1336,7 +1336,8 @@ private void TryParseSigned(long max, in TX text, out long? result) } /// - /// This produces zero for empty. It returns false if the text is not parsable. + /// This produces zero for empty, or NaN depending on the <see cref="DoubleParser.OptionFlags"/> used. + /// It returns false if the text is not parsable. 
/// On failure, it sets dst to the NA value. /// public bool TryParse(in TX src, out R4 dst) @@ -1349,7 +1350,8 @@ public bool TryParse(in TX src, out R4 dst) } /// - /// This produces zero for empty. It returns false if the text is not parsable. + /// This produces zero for empty, or NaN depending on the <see cref="DoubleParser.OptionFlags"/> used. + /// It returns false if the text is not parsable. /// On failure, it sets dst to the NA value. /// public bool TryParse(in TX src, out R8 dst) @@ -1361,6 +1363,9 @@ public bool TryParse(in TX src, out R8 dst) return IsStdMissing(ref span); } + /// + /// This produces default for empty. + /// public bool TryParse(in TX src, out TS dst) { if (src.IsEmpty) @@ -1375,6 +1380,9 @@ public bool TryParse(in TX src, out TS dst) return false; } + /// + /// This produces default for empty. + /// public bool TryParse(in TX src, out DT dst) { if (src.IsEmpty) @@ -1389,6 +1397,9 @@ public bool TryParse(in TX src, out DT dst) return false; } + /// + /// This produces default for empty. + /// public bool TryParse(in TX src, out DZ dst) { if (src.IsEmpty) From b952e71d871b8f801ebcb0973ae6a8727e2548ea Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 19:25:06 -0700 Subject: [PATCH 06/21] Added Test --- .../TextLoader/missing_fields-with-impute.csv | 18 ++ .../missing_fields-without-impute.csv | 15 ++ test/Microsoft.ML.Tests/TextLoaderTests.cs | 174 +++++++++++++++++- test/data/missing_fields.csv | 15 ++ 4 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv create mode 100644 test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv create mode 100644 test/data/missing_fields.csv diff --git a/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv b/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv new file mode 100644 index 0000000000..fdf3522f51 --- /dev/null +++ 
b/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv @@ -0,0 +1,18 @@ +int,description,num1,num2,date,num3,num4 +0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78 +0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +1,"this has a quoted int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111 +2,"this has an empty num1 and a space in num3",NaN,22.22,2/2/2002,NaN,2222.2222 +3,"this has an empty quoted num1 and a quoted space in num3",NaN,33.33,3/3/2003,NaN,3333.3333 +4,"this has a space in num2 and a space in num4",4.4,NaN,4/4/2004,444.444,NaN +5,"this has a quoted space num2 and quoted space in num4",5.5,NaN,5/5/2005,555.555,NaN +// The next two rows map the missing columns as 0, as it was decided not to impute with NaN +// in this case +6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0 +7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0 +// In the next case we do impute with NaN because the separator is there +8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,NaN +9,,9.9,99.99,9/9/2009,999.999,NaN +0,"",10.10,NaN,10/10/2010,101010.101010,NaN +11,NaN,NaN,NaN,11/11/2011,NaN,Infinity \ No newline at end of file diff --git a/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv b/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv new file mode 100644 index 0000000000..95c6874c08 --- /dev/null +++ b/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv @@ -0,0 +1,15 @@ +int,description,num1,num2,date,num3,num4 +0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78 +0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +1,"this has a quoted 
int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111 +2,"this has an empty num1 and a space in num3",0,22.22,2/2/2002,0,2222.2222 +3,"this has an empty quoted num1 and a quoted space in num3",0,33.33,3/3/2003,0,3333.3333 +4,"this has a space in num2 and a space in num4",4.4,0,4/4/2004,444.444,0 +5,"this has a quoted space num2 and quoted space in num4",5.5,0,5/5/2005,555.555,0 +6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0 +7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0 +8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,0 +9,,9.9,99.99,9/9/2009,999.999,NaN +0,,10.10,NaN,10/10/2010,101010.101010,NaN +11,NaN,NaN,NaN,11/11/2011,0,Infinity \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 358fad1272..278c0525c9 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -719,7 +719,7 @@ public void LoaderColumnsFromIrisData(bool useOptionsObject) irisFirstRow["SepalWidth"] = 3.5f; irisFirstRow["PetalLength"] = 1.4f; irisFirstRow["PetalWidth"] = 0.2f; - + var irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); // Simple load @@ -1416,5 +1416,177 @@ public void TestInvalidMultilineCSVQuote() Assert.True(threwException, "Invalid file should have thrown an exception"); } + + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestLoadTextWithEmptyFloat(bool useImputeEmptyFloats) + { + var mlContext = new MLContext(seed: 1); + var inputPath = GetDataPath("missing_fields.csv"); + var baselineWithImpute = GetBaselinePath("TextLoader", "missing_fields-with-impute.csv"); + var baselineWithoutImpute = GetBaselinePath("TextLoader", "missing_fields-without-impute.csv"); + + var options = new TextLoader.Options() + { + HasHeader = true, + Separator = ",", + AllowQuoting = true, 
+ Columns = new[] + { + new TextLoader.Column("id", DataKind.Int32, 0), + new TextLoader.Column("description", DataKind.String, 1), + new TextLoader.Column("date", DataKind.DateTime, 4), + new TextLoader.Column("sing1", DataKind.Single, 2), + new TextLoader.Column("sing2", DataKind.Single, 3), + new TextLoader.Column("singFt1", DataKind.Single, new [] { new TextLoader.Range(2,3) } ), + new TextLoader.Column("sing3", DataKind.Single, 5), + new TextLoader.Column("sing4", DataKind.Single, 6), + new TextLoader.Column("singFt2", DataKind.Single, new [] { new TextLoader.Range(2,3), new TextLoader.Range(5,6) } ), + new TextLoader.Column("doub1", DataKind.Double, 2), + new TextLoader.Column("doub2", DataKind.Double, 3), + new TextLoader.Column("doubFt1", DataKind.Double, new [] { new TextLoader.Range(2,3) } ), + new TextLoader.Column("doub3", DataKind.Double, 5), + new TextLoader.Column("doub4", DataKind.Double, 6), + new TextLoader.Column("doubFt2", DataKind.Double, new [] { new TextLoader.Range(2,3), new TextLoader.Range(5,6) } ) + }, + }; + + IDataView baselineDV; + IDataView testDV ; + if(useImputeEmptyFloats) + { + baselineDV = mlContext.Data.LoadFromTextFile(baselineWithImpute, options); + options.ImputeEmptyFloats = true; + testDV = mlContext.Data.LoadFromTextFile(inputPath, options); + } + else + { + baselineDV = mlContext.Data.LoadFromTextFile(baselineWithoutImpute, options); + testDV = mlContext.Data.LoadFromTextFile(inputPath, options); + } + + Int32 baselineId = default; + ReadOnlyMemory baselineDescription = default; + DateTime baselineDate = default; + Single baselineSing1 = default; + Single baselineSing2 = default; + Single baselineSing3 = default; + Single baselineSing4 = default; + Double baselineDoub1 = default; + Double baselineDoub2 = default; + Double baselineDoub3 = default; + Double baselineDoub4 = default; + + Int32 testId = default; + ReadOnlyMemory testDescription = default; + DateTime testDate = default; + Single testSing1 = default; + Single 
testSing2 = default; + Single testSing3 = default; + Single testSing4 = default; + VBuffer testSingFt1 = default; + VBuffer testSingFt2 = default; + Double testDoub1 = default; + Double testDoub2 = default; + Double testDoub3 = default; + Double testDoub4 = default; + VBuffer testDoubFt1 = default; + VBuffer testDoubFt2 = default; + + using (var cursorBaseline = baselineDV.GetRowCursor(baselineDV.Schema)) + using (var cursorTest = testDV.GetRowCursor(testDV.Schema)) + { + var delegateBaselineId = cursorBaseline.GetGetter(baselineDV.Schema["id"]); + var delegateBaselineDescription = cursorBaseline.GetGetter>(baselineDV.Schema["description"]); + var delegateBaselineDate = cursorBaseline.GetGetter(baselineDV.Schema["date"]); + var delegateBaselineSing1 = cursorBaseline.GetGetter(baselineDV.Schema["sing1"]); + var delegateBaselineSing2 = cursorBaseline.GetGetter(baselineDV.Schema["sing2"]); + var delegateBaselineSing3 = cursorBaseline.GetGetter(baselineDV.Schema["sing3"]); + var delegateBaselineSing4 = cursorBaseline.GetGetter(baselineDV.Schema["sing4"]); + var delegateBaselineDoub1 = cursorBaseline.GetGetter(baselineDV.Schema["doub1"]); + var delegateBaselineDoub2 = cursorBaseline.GetGetter(baselineDV.Schema["doub2"]); + var delegateBaselineDoub3 = cursorBaseline.GetGetter(baselineDV.Schema["doub3"]); + var delegateBaselineDoub4 = cursorBaseline.GetGetter(baselineDV.Schema["doub4"]); + + var delegateTestId = cursorTest.GetGetter(testDV.Schema["id"]); + var delegateTestDescription = cursorTest.GetGetter>(testDV.Schema["description"]); + var delegateTestDate = cursorTest.GetGetter(testDV.Schema["date"]); + var delegateTestSing1 = cursorTest.GetGetter(testDV.Schema["sing1"]); + var delegateTestSing2 = cursorTest.GetGetter(testDV.Schema["sing2"]); + var delegateTestSing3 = cursorTest.GetGetter(testDV.Schema["sing3"]); + var delegateTestSing4 = cursorTest.GetGetter(testDV.Schema["sing4"]); + var delegateTestSingFt1 = cursorTest.GetGetter>(testDV.Schema["singFt1"]); + var 
delegateTestSingFt2 = cursorTest.GetGetter>(testDV.Schema["singFt2"]); + var delegateTestDoub1 = cursorTest.GetGetter(testDV.Schema["doub1"]); + var delegateTestDoub2 = cursorTest.GetGetter(testDV.Schema["doub2"]); + var delegateTestDoub3 = cursorTest.GetGetter(testDV.Schema["doub3"]); + var delegateTestDoub4 = cursorTest.GetGetter(testDV.Schema["doub4"]); + var delegateTestDoubFt1 = cursorTest.GetGetter>(testDV.Schema["doubFt1"]); + var delegateTestDoubFt2 = cursorTest.GetGetter>(testDV.Schema["doubFt2"]); + + + while (cursorBaseline.MoveNext() && cursorTest.MoveNext()) + { + delegateBaselineId(ref baselineId); + delegateBaselineDescription(ref baselineDescription); + delegateBaselineDate(ref baselineDate); + delegateBaselineSing1(ref baselineSing1); + delegateBaselineSing2(ref baselineSing2); + delegateBaselineSing3(ref baselineSing3); + delegateBaselineSing4(ref baselineSing4); + delegateBaselineDoub1(ref baselineDoub1); + delegateBaselineDoub2(ref baselineDoub2); + delegateBaselineDoub3(ref baselineDoub3); + delegateBaselineDoub4(ref baselineDoub4); + + delegateTestId(ref testId); + delegateTestDescription(ref testDescription); + delegateTestDate(ref testDate); + delegateTestSing1(ref testSing1); + delegateTestSing2(ref testSing2); + delegateTestSing3(ref testSing3); + delegateTestSing4(ref testSing4); + delegateTestSingFt1(ref testSingFt1); + delegateTestSingFt2(ref testSingFt2); + delegateTestDoub1(ref testDoub1); + delegateTestDoub2(ref testDoub2); + delegateTestDoub3(ref testDoub3); + delegateTestDoub4(ref testDoub4); + delegateTestDoubFt1(ref testDoubFt1); + delegateTestDoubFt2(ref testDoubFt2); + + Assert.Equal(baselineId, testId); + Assert.Equal(baselineDescription.ToString(), testDescription.ToString()); + Assert.Equal(baselineDate, testDate); + Assert.Equal(baselineSing1, testSing1); + Assert.Equal(baselineSing2, testSing2); + Assert.Equal(baselineSing3, testSing3); + Assert.Equal(baselineSing4, testSing4); + Assert.Equal(baselineDoub1, testDoub1); + 
Assert.Equal(baselineDoub2, testDoub2); + Assert.Equal(baselineDoub3, testDoub3); + Assert.Equal(baselineDoub4, testDoub4); + + var testSingFt1Arr = testSingFt1.DenseValues().ToArray(); + var testSingFt2Arr = testSingFt2.DenseValues().ToArray(); + Assert.Equal(baselineSing1, testSingFt1Arr[0]); + Assert.Equal(baselineSing2, testSingFt1Arr[1]); + Assert.Equal(baselineSing1, testSingFt2Arr[0]); + Assert.Equal(baselineSing2, testSingFt2Arr[1]); + Assert.Equal(baselineSing3, testSingFt2Arr[2]); + Assert.Equal(baselineSing4, testSingFt2Arr[3]); + + var testDoubFt1Arr = testDoubFt1.DenseValues().ToArray(); + var testDoubFt2Arr = testDoubFt2.DenseValues().ToArray(); + Assert.Equal(baselineDoub1, testDoubFt1Arr[0]); + Assert.Equal(baselineDoub2, testDoubFt1Arr[1]); + Assert.Equal(baselineDoub1, testDoubFt2Arr[0]); + Assert.Equal(baselineDoub2, testDoubFt2Arr[1]); + Assert.Equal(baselineDoub3, testDoubFt2Arr[2]); + Assert.Equal(baselineDoub4, testDoubFt2Arr[3]); + } + } + } } } diff --git a/test/data/missing_fields.csv b/test/data/missing_fields.csv new file mode 100644 index 0000000000..1eb725b584 --- /dev/null +++ b/test/data/missing_fields.csv @@ -0,0 +1,15 @@ +int,description,num1,num2,date,num3,num4 +0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78 +,"this has an empty int and date", 1.1, 11.11,,111.111,1111.11111 +"","this has a quoted empty int and date", 1.1, 11.11,"",111.111,1111.11111 +"1","this has a quoted int and date", 1.1, 11.11,"1/1/2001",111.111,1111.11111 +2,"this has an empty num1 and a space in num3",,22.22,2/2/2002, ,2222.2222 +3,"this has an empty quoted num1 and a quoted space in num3","",33.33,3/3/2003," ",3333.3333 +4,"this has a space in num2 and a space in num4",4.4, ,4/4/2004,444.444, +5,"this has a quoted space num2 and quoted space in num4",5.5,"",5/5/2005,555.555," " +6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66 +7,"this has no num4 (the separator corresponding to it is 
missing)",7.7,77.77,7/7/2007,777.777 +8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888, +9,"",9.9,99.99,9/9/2009,999.999,NaN +,,10.10,?,10/10/2010,101010.101010,? +11,NaN,?,NaN,11/11/2011,,Infinity \ No newline at end of file From c0cdbec7e6634b130297ffc6e79d5512e68ae212 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 19:25:34 -0700 Subject: [PATCH 07/21] Typo in test --- test/Microsoft.ML.Tests/TextLoaderTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 278c0525c9..4d4bb84b1e 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -1398,7 +1398,7 @@ public void TestInvalidMultilineCSVQuote() new TextLoader.Column("id", DataKind.Int32, 0), new TextLoader.Column("description", DataKind.String, 1), new TextLoader.Column("animal", DataKind.String, 2), - }, + }, }; var data = mlContext.Data.LoadFromTextFile(filePath, options); From 684d359541eb2729c29af798d7aec1fc3040a4c9 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 20:58:54 -0700 Subject: [PATCH 08/21] Revert "new code coverage (#5169)" This reverts commit c8cace2b1bb4078be544423f20888b09921270e3. 
--- build.proj | 13 ------------- build/Codecoverage.proj | 4 ++-- build/Dependencies.props | 2 +- test/Directory.Build.props | 22 ++++++++++++++++++++-- test/coverlet.runsettings | 18 ------------------ test/run-tests.proj | 7 +------ 6 files changed, 24 insertions(+), 42 deletions(-) delete mode 100644 test/coverlet.runsettings diff --git a/build.proj b/build.proj index 1d95743b60..f7f3624679 100644 --- a/build.proj +++ b/build.proj @@ -38,7 +38,6 @@ $(TraversalBuildDependsOn); DownloadExternalTestFiles; DownloadTensorflowMetaFiles; - DeleteTestHost; @@ -117,18 +116,6 @@ - - - - - - - - - - - diff --git a/build/Codecoverage.proj b/build/Codecoverage.proj index 03595f7ca8..faa9b15c53 100644 --- a/build/Codecoverage.proj +++ b/build/Codecoverage.proj @@ -16,8 +16,8 @@ <_ReportGeneratorPath>$(PkgReportGenerator)\tools\net47\ReportGenerator.exe - - + + <_CodecovArgs Include="-f;$(BaseOutputPath)$(PlatformConfig)\coverage\Cobertura.xml" /> diff --git a/build/Dependencies.props b/build/Dependencies.props index 903b984a40..9024a0d840 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -43,7 +43,7 @@ 1.0.0-beta-62824-02 1.9.0 - 1.2.1 + 2.7.0 4.3.6 1.0.0-beta.19225.5 diff --git a/test/Directory.Build.props b/test/Directory.Build.props index 14a05ed538..68034bc2eb 100644 --- a/test/Directory.Build.props +++ b/test/Directory.Build.props @@ -27,14 +27,14 @@ - + - + @@ -54,4 +54,22 @@ + + + true + + true + true + true + opencover + $(BaseOutputPath)$(PlatformConfig)\coverage\$(MSBuildProjectName).coverage + [Microsoft.ML.*]* + + [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* + Obsolete,ExcludeFromCodeCoverage + $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs + + diff --git a/test/coverlet.runsettings 
b/test/coverlet.runsettings deleted file mode 100644 index 7d3ce8b280..0000000000 --- a/test/coverlet.runsettings +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - opencover - [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* - [Microsoft.ML.*]* - Obsolete,ExcludeFromCodeCoverage - $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs - true - true - - - - - diff --git a/test/run-tests.proj b/test/run-tests.proj index 235f1402f2..ee827ef41c 100644 --- a/test/run-tests.proj +++ b/test/run-tests.proj @@ -8,12 +8,7 @@ - - From b1cc17646476a9d2d5778b885bab830708f98c8c Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 21:14:14 -0700 Subject: [PATCH 09/21] Updated manifest --- .../Common/EntryPoints/core_manifest.json | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 6a0c17a44f..c897ff4f9d 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -319,7 +319,7 @@ { "Name": "AllowQuoting", "Type": "Bool", - "Desc": "Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.", + "Desc": "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input valuefrom actual separators. When true, separators within double quotes are treated as part of the input value. 
When false, allseparators, even those within quotes, are treated as delimiting a new column.", "Aliases": [ "quote" ], @@ -464,6 +464,18 @@ "SortOrder": 150.0, "IsNullable": false, "Default": "\"" + }, + { + "Name": "ImputeEmptyFloats", + "Type": "Bool", + "Desc": "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", + "Aliases": [ + "imputefloat" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false } ] }, From 65fcb59dae6625f7b25bf60a456f6320dd9ded74 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 21:14:32 -0700 Subject: [PATCH 10/21] Revert "Revert "new code coverage (#5169)"" This reverts commit 684d359541eb2729c29af798d7aec1fc3040a4c9. --- build.proj | 13 +++++++++++++ build/Codecoverage.proj | 4 ++-- build/Dependencies.props | 2 +- test/Directory.Build.props | 22 ++-------------------- test/coverlet.runsettings | 18 ++++++++++++++++++ test/run-tests.proj | 7 ++++++- 6 files changed, 42 insertions(+), 24 deletions(-) create mode 100644 test/coverlet.runsettings diff --git a/build.proj b/build.proj index f7f3624679..1d95743b60 100644 --- a/build.proj +++ b/build.proj @@ -38,6 +38,7 @@ $(TraversalBuildDependsOn); DownloadExternalTestFiles; DownloadTensorflowMetaFiles; + DeleteTestHost; @@ -116,6 +117,18 @@ + + + + + + + + + + + diff --git a/build/Codecoverage.proj b/build/Codecoverage.proj index faa9b15c53..03595f7ca8 100644 --- a/build/Codecoverage.proj +++ b/build/Codecoverage.proj @@ -16,8 +16,8 @@ <_ReportGeneratorPath>$(PkgReportGenerator)\tools\net47\ReportGenerator.exe - - + + <_CodecovArgs Include="-f;$(BaseOutputPath)$(PlatformConfig)\coverage\Cobertura.xml" /> diff --git a/build/Dependencies.props b/build/Dependencies.props index 9024a0d840..903b984a40 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -43,7 +43,7 @@ 1.0.0-beta-62824-02 1.9.0 - 2.7.0 + 1.2.1 4.3.6 1.0.0-beta.19225.5 diff --git a/test/Directory.Build.props 
b/test/Directory.Build.props index 68034bc2eb..14a05ed538 100644 --- a/test/Directory.Build.props +++ b/test/Directory.Build.props @@ -27,14 +27,14 @@ - + - + @@ -54,22 +54,4 @@ - - - true - - true - true - true - opencover - $(BaseOutputPath)$(PlatformConfig)\coverage\$(MSBuildProjectName).coverage - [Microsoft.ML.*]* - - [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* - Obsolete,ExcludeFromCodeCoverage - $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs - - diff --git a/test/coverlet.runsettings b/test/coverlet.runsettings new file mode 100644 index 0000000000..7d3ce8b280 --- /dev/null +++ b/test/coverlet.runsettings @@ -0,0 +1,18 @@ + + + + + + + opencover + [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* + [Microsoft.ML.*]* + Obsolete,ExcludeFromCodeCoverage + $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs + true + true + + + + + diff --git a/test/run-tests.proj b/test/run-tests.proj index ee827ef41c..235f1402f2 100644 --- a/test/run-tests.proj +++ b/test/run-tests.proj @@ -8,7 +8,12 @@ - + From a68da2593d0b4894b8da6f7ea2894a306d739f23 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 21:53:53 -0700 Subject: [PATCH 11/21] Minor fix in docs --- src/Microsoft.ML.Core/Utilities/DoubleParser.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs index dd13517731..9e35b7fb14 100644 --- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs +++ 
b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs @@ -24,7 +24,7 @@ internal enum OptionFlags : uint // default behavior is to use "." as decimal marker. UseCommaAsDecimalMarker = 0x01, - // If this flag is set, then empty spans (or those with only spaces) + // If this flag is set, then empty spans (or those with only white-space) // will be parsed as NaN. If it isn't set, then default behavior // is to return them as 0. EmptyAsNaN = 0x02, From df32ba1082e75911fc4d22e84dd79484f1a4cc96 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 22:34:08 -0700 Subject: [PATCH 12/21] Revert "Revert "Revert "new code coverage (#5169)""" This reverts commit 65fcb59dae6625f7b25bf60a456f6320dd9ded74. --- build.proj | 13 ------------- build/Codecoverage.proj | 4 ++-- build/Dependencies.props | 2 +- test/Directory.Build.props | 22 ++++++++++++++++++++-- test/coverlet.runsettings | 18 ------------------ test/run-tests.proj | 7 +------ 6 files changed, 24 insertions(+), 42 deletions(-) delete mode 100644 test/coverlet.runsettings diff --git a/build.proj b/build.proj index 1d95743b60..f7f3624679 100644 --- a/build.proj +++ b/build.proj @@ -38,7 +38,6 @@ $(TraversalBuildDependsOn); DownloadExternalTestFiles; DownloadTensorflowMetaFiles; - DeleteTestHost; @@ -117,18 +116,6 @@ - - - - - - - - - - - diff --git a/build/Codecoverage.proj b/build/Codecoverage.proj index 03595f7ca8..faa9b15c53 100644 --- a/build/Codecoverage.proj +++ b/build/Codecoverage.proj @@ -16,8 +16,8 @@ <_ReportGeneratorPath>$(PkgReportGenerator)\tools\net47\ReportGenerator.exe - - + + <_CodecovArgs Include="-f;$(BaseOutputPath)$(PlatformConfig)\coverage\Cobertura.xml" /> diff --git a/build/Dependencies.props b/build/Dependencies.props index 903b984a40..9024a0d840 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -43,7 +43,7 @@ 1.0.0-beta-62824-02 1.9.0 - 1.2.1 + 2.7.0 4.3.6 1.0.0-beta.19225.5 diff --git a/test/Directory.Build.props b/test/Directory.Build.props index 
14a05ed538..68034bc2eb 100644 --- a/test/Directory.Build.props +++ b/test/Directory.Build.props @@ -27,14 +27,14 @@ - + - + @@ -54,4 +54,22 @@ + + + true + + true + true + true + opencover + $(BaseOutputPath)$(PlatformConfig)\coverage\$(MSBuildProjectName).coverage + [Microsoft.ML.*]* + + [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* + Obsolete,ExcludeFromCodeCoverage + $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs + + diff --git a/test/coverlet.runsettings b/test/coverlet.runsettings deleted file mode 100644 index 7d3ce8b280..0000000000 --- a/test/coverlet.runsettings +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - opencover - [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* - [Microsoft.ML.*]* - Obsolete,ExcludeFromCodeCoverage - $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs - true - true - - - - - diff --git a/test/run-tests.proj b/test/run-tests.proj index 235f1402f2..ee827ef41c 100644 --- a/test/run-tests.proj +++ b/test/run-tests.proj @@ -8,12 +8,7 @@ - - From f6912b1490ee157f4fa3de8b3503e568dac62c97 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 22:40:30 -0700 Subject: [PATCH 13/21] Removed checking static instance in another test, because now it fails if it runs after the new test for the Impute option. 
--- test/Microsoft.ML.Tests/TextLoaderTests.cs | 117 +++++++++------------ 1 file changed, 49 insertions(+), 68 deletions(-) diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 4d4bb84b1e..56dd550540 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -1040,87 +1040,68 @@ public void TestDifferentDecimalMarkersAtTheSameTime(bool useCorrectPeriod, bool DecimalMarker = ',' }; - for (int j = 0; j < 2; j++) - { - // Run various times inside the same test, to also test that TextLoader is only creating 1 - // Custom instance of ValueCreatorCache - IDataView dataViewPeriod; - IDataView dataViewComma; + IDataView dataViewPeriod; + IDataView dataViewComma; - if (useCorrectPeriod) - dataViewPeriod = mlContext.Data.LoadFromTextFile(periodPath, optionsPeriod); - else - dataViewPeriod = mlContext.Data.LoadFromTextFile(commaPath, optionsPeriod); + if (useCorrectPeriod) + dataViewPeriod = mlContext.Data.LoadFromTextFile(periodPath, optionsPeriod); + else + dataViewPeriod = mlContext.Data.LoadFromTextFile(commaPath, optionsPeriod); - if (useCorrectComma) - dataViewComma = mlContext.Data.LoadFromTextFile(commaPath, optionsComma); - else - dataViewComma = mlContext.Data.LoadFromTextFile(periodPath, optionsComma); + if (useCorrectComma) + dataViewComma = mlContext.Data.LoadFromTextFile(commaPath, optionsComma); + else + dataViewComma = mlContext.Data.LoadFromTextFile(periodPath, optionsComma); - VBuffer featuresPeriod = default; - VBuffer featuresComma = default; + VBuffer featuresPeriod = default; + VBuffer featuresComma = default; - using (var cursorPeriod = dataViewPeriod.GetRowCursor(dataViewPeriod.Schema)) - using (var cursorComma = dataViewComma.GetRowCursor(dataViewComma.Schema)) + using (var cursorPeriod = dataViewPeriod.GetRowCursor(dataViewPeriod.Schema)) + using (var cursorComma = dataViewComma.GetRowCursor(dataViewComma.Schema)) + { + var delegatePeriod = 
cursorPeriod.GetGetter>(dataViewPeriod.Schema["Features"]); + var delegateComma = cursorComma.GetGetter>(dataViewPeriod.Schema["Features"]); + while (cursorPeriod.MoveNext() && cursorComma.MoveNext()) { - var delegatePeriod = cursorPeriod.GetGetter>(dataViewPeriod.Schema["Features"]); - var delegateComma = cursorComma.GetGetter>(dataViewPeriod.Schema["Features"]); - while (cursorPeriod.MoveNext() && cursorComma.MoveNext()) - { - delegatePeriod(ref featuresPeriod); - delegateComma(ref featuresComma); + delegatePeriod(ref featuresPeriod); + delegateComma(ref featuresComma); - var featuresPeriodArray = featuresPeriod.GetValues().ToArray(); - var featuresCommaArray = featuresComma.GetValues().ToArray(); - Assert.Equal(featuresPeriodArray.Length, featuresCommaArray.Length); + var featuresPeriodArray = featuresPeriod.GetValues().ToArray(); + var featuresCommaArray = featuresComma.GetValues().ToArray(); + Assert.Equal(featuresPeriodArray.Length, featuresCommaArray.Length); - for (int i = 0; i < featuresPeriodArray.Length; i++) + for (int i = 0; i < featuresPeriodArray.Length; i++) + { + if (useCorrectPeriod && useCorrectComma) { - if (useCorrectPeriod && useCorrectComma) - { - // Check that none of the two files loadad NaNs - // As both of them should have been loaded correctly - Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); - Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); - } - else if (!useCorrectPeriod && !useCorrectComma) - { - // Check that everything was loaded as NaN - // Because the wrong decimal marker was used for both loaders - Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); - Assert.Equal(Single.NaN, featuresPeriodArray[i]); - } - else if (!useCorrectPeriod && useCorrectComma) - { - // Check that only the file with commas was loaded correctly - Assert.Equal(Single.NaN, featuresPeriodArray[i]); - Assert.NotEqual(Single.NaN, featuresCommaArray[i]); - } - else - { - // Check that only the file with periods was loaded correctly - 
Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); - Assert.Equal(Single.NaN, featuresCommaArray[i]); - } + // Check that none of the two files loadad NaNs + // As both of them should have been loaded correctly + Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); + Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); + } + else if (!useCorrectPeriod && !useCorrectComma) + { + // Check that everything was loaded as NaN + // Because the wrong decimal marker was used for both loaders + Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); + Assert.Equal(Single.NaN, featuresPeriodArray[i]); + } + else if (!useCorrectPeriod && useCorrectComma) + { + // Check that only the file with commas was loaded correctly + Assert.Equal(Single.NaN, featuresPeriodArray[i]); + Assert.NotEqual(Single.NaN, featuresCommaArray[i]); + } + else + { + // Check that only the file with periods was loaded correctly + Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); + Assert.Equal(Single.NaN, featuresCommaArray[i]); } } } - - // Check how many custom instances there are of TextLoader.ValueCreatorCache - var vccType = typeof(TextLoader).GetNestedType("ValueCreatorCache", BindingFlags.NonPublic | BindingFlags.Static); - var customInstancesInfo = vccType.GetField("_customInstances", BindingFlags.NonPublic | BindingFlags.Static); - var customInstancesObject = customInstancesInfo.GetValue(null); - var customInstancesCount = (int)customInstancesObject.GetType().GetProperty("Count").GetValue(customInstancesObject, null); - var customInstancesContainsMethod = customInstancesObject.GetType().GetMethod("ContainsKey"); - - // Regardless of useCorrectPeriod and useCorrectComma - // Since we always created a TextLoader with Comma as DecimalMarker - // There should always be 1, and only 1, custom instance of ValueCreatorCache, corresponding to the comma option - // Even after running multiple times the loop above. 
- Assert.Equal(1, customInstancesCount); - Assert.True((bool)customInstancesContainsMethod.Invoke(customInstancesObject, new[] { (object) DoubleParser.OptionFlags.UseCommaAsDecimalMarker })); } } From 3dd66ffd81feec7a2a84228d52ddf4393813e5ab Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 2 Jun 2020 22:40:58 -0700 Subject: [PATCH 14/21] Revert "Revert "Revert "Revert "new code coverage (#5169)"""" This reverts commit df32ba1082e75911fc4d22e84dd79484f1a4cc96. --- build.proj | 13 +++++++++++++ build/Codecoverage.proj | 4 ++-- build/Dependencies.props | 2 +- test/Directory.Build.props | 22 ++-------------------- test/coverlet.runsettings | 18 ++++++++++++++++++ test/run-tests.proj | 7 ++++++- 6 files changed, 42 insertions(+), 24 deletions(-) create mode 100644 test/coverlet.runsettings diff --git a/build.proj b/build.proj index f7f3624679..1d95743b60 100644 --- a/build.proj +++ b/build.proj @@ -38,6 +38,7 @@ $(TraversalBuildDependsOn); DownloadExternalTestFiles; DownloadTensorflowMetaFiles; + DeleteTestHost; @@ -116,6 +117,18 @@ + + + + + + + + + + + diff --git a/build/Codecoverage.proj b/build/Codecoverage.proj index faa9b15c53..03595f7ca8 100644 --- a/build/Codecoverage.proj +++ b/build/Codecoverage.proj @@ -16,8 +16,8 @@ <_ReportGeneratorPath>$(PkgReportGenerator)\tools\net47\ReportGenerator.exe - - + + <_CodecovArgs Include="-f;$(BaseOutputPath)$(PlatformConfig)\coverage\Cobertura.xml" /> diff --git a/build/Dependencies.props b/build/Dependencies.props index 9024a0d840..903b984a40 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -43,7 +43,7 @@ 1.0.0-beta-62824-02 1.9.0 - 2.7.0 + 1.2.1 4.3.6 1.0.0-beta.19225.5 diff --git a/test/Directory.Build.props b/test/Directory.Build.props index 68034bc2eb..14a05ed538 100644 --- a/test/Directory.Build.props +++ b/test/Directory.Build.props @@ -27,14 +27,14 @@ - + - + @@ -54,22 +54,4 @@ - - - true - - true - true - true - opencover - 
$(BaseOutputPath)$(PlatformConfig)\coverage\$(MSBuildProjectName).coverage - [Microsoft.ML.*]* - - [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* - Obsolete,ExcludeFromCodeCoverage - $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs - - diff --git a/test/coverlet.runsettings b/test/coverlet.runsettings new file mode 100644 index 0000000000..7d3ce8b280 --- /dev/null +++ b/test/coverlet.runsettings @@ -0,0 +1,18 @@ + + + + + + + opencover + [*]Microsoft.ML.*Contracts*,[*]Microsoft.ML.Internal.Utilities*,[*]Microsoft.ML.Data.VBuffer* + [Microsoft.ML.*]* + Obsolete,ExcludeFromCodeCoverage + $(RepoRoot)src\Microsoft.ML.OnnxConverter\OnnxMl.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Buffer.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensor.cs,$(RepoRoot)src\Microsoft.ML.TensorFlow\TensorFlow\Tensorflow.cs + true + true + + + + + diff --git a/test/run-tests.proj b/test/run-tests.proj index ee827ef41c..235f1402f2 100644 --- a/test/run-tests.proj +++ b/test/run-tests.proj @@ -8,7 +8,12 @@ - + From 280a35e29aec11bf2dff86aadb422674fe2bae34 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 8 Jun 2020 09:12:51 -0700 Subject: [PATCH 15/21] Bumped version to include new flag --- .../DataLoadSave/Text/TextLoader.cs | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 23fe7862ba..76cb6c2afa 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1084,7 +1084,8 @@ private static VersionInfo GetVersionInfo() //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header 
now retained if used and present //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags - verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars and imputeEmptyFloats flag + //verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars + verWrittenCur: 0x0001000E, // Added imputeEmptyFloats flag verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1416,7 +1417,25 @@ private TextLoader(IHost host, ModelLoadContext ctx) _maxRows = ctx.Reader.ReadInt64(); host.CheckDecode(_maxRows > 0); _flags = (OptionFlags)ctx.Reader.ReadUInt32(); - host.CheckDecode((_flags & ~OptionFlags.All) == 0); + + // Flags introduced with the first ML.NET commit: + var acceptableFlags = OptionFlags.TrimWhitespace; + acceptableFlags |= OptionFlags.HasHeader; + acceptableFlags |= OptionFlags.AllowQuoting; + acceptableFlags |= OptionFlags.AllowSparse; + + // Flags added on later versions of TextLoader: + if(ctx.Header.ModelVerWritten >= 0x0001000C) + { + acceptableFlags |= OptionFlags.ReadMultilines; + } + if(ctx.Header.ModelVerWritten >= 0x0001000E) + { + acceptableFlags |= OptionFlags.ImputeEmptyFloats; + } + + host.CheckDecode((_flags & ~acceptableFlags) == 0); + _inputSize = ctx.Reader.ReadInt32(); host.CheckDecode(0 <= _inputSize && _inputSize < SrcLim); From 8e05dff23a407db48f80a8e79cedad2935621821 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 8 Jun 2020 16:23:01 -0700 Subject: [PATCH 16/21] Load missing columns also as NaNs --- .../DataLoadSave/Text/TextLoaderParser.cs | 38 ++++++++++++++++++- .../TextLoader/missing_fields-with-impute.csv | 7 ++-- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 406a0bd989..147608f574 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ 
b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -228,6 +228,8 @@ private abstract class ColumnPipe public abstract bool HasNA { get; } + public abstract bool IsReal { get; } // If the type of the ColumnPipe is either Single or Double + protected ColumnPipe(RowSet rows) { Contracts.AssertValue(rows); @@ -251,6 +253,8 @@ private sealed class PrimitivePipe : ColumnPipe public override bool HasNA { get; } + public override bool IsReal { get; } + public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper conv) : base(rows) { @@ -259,6 +263,7 @@ public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper : ColumnPipe public override bool HasNA { get; } + public override bool IsReal { get; } + private class VectorValue { private readonly VectorPipe _pipe; @@ -441,6 +448,7 @@ public VectorPipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper for (int i = 0; i < _values.Length; i++) _values[i] = new VectorValue(this); HasNA = Conversions.DefaultInstance.TryGetIsNAPredicate(type, out var del); + IsReal = typeof(TItem) == typeof(Single) || typeof(TItem) == typeof(Double); } public override void Reset(int irow, int size) @@ -649,6 +657,7 @@ public void Clear() private readonly char[] _separators; private readonly OptionFlags _flags; + private readonly bool _imputeReals; private readonly char _escapeChar; private readonly int _inputSize; private readonly ColInfo[] _infos; @@ -659,6 +668,8 @@ public void Clear() private volatile int _csrc; private volatile int _mismatchCount; + private ReadOnlyMemory _blank; + public Parser(TextLoader parent) { Contracts.AssertValue(parent); @@ -715,6 +726,8 @@ public Parser(TextLoader parent) _flags = parent._flags; _escapeChar = parent._escapeChar; _inputSize = parent._inputSize; + _imputeReals = (parent._flags & OptionFlags.ImputeEmptyFloats) != 0; + _blank = ReadOnlyMemory.Empty; Contracts.Assert(_inputSize >= 0); } @@ -1388,6 +1401,7 @@ private void ProcessVec(int srcLim, FieldSet 
fields, ColInfo info, ColumnPipe v, Contracts.Assert(sizeVar >= 0); int size = checked(info.SizeBase + sizeVar); + var xx = fields.Spans[0].ToString(); v.Reset(irow, size); int ivDst = 0; for (int i = 0; i < info.Segments.Length; i++) @@ -1405,10 +1419,10 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, int sizeSeg = lim - min; Contracts.Assert(ivDst <= size - sizeSeg); + int indexBase = ivDst - min; int isrc = fields.Indices.FindIndexSorted(0, fields.Count, min); if (isrc < fields.Count && fields.Indices[isrc] < lim) { - int indexBase = ivDst - min; int isrcLim = fields.Indices.FindIndexSorted(isrc, fields.Count, lim); Contracts.Assert(isrc < isrcLim); for (; isrc < isrcLim; isrc++) @@ -1423,6 +1437,19 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, } } } + + if(_imputeReals && isrc >= fields.Count && v.IsReal) + { + // If the user has set the EmptyRealsAsNan option to true, + // And there are missing columns on a given row, + // then we should load them as if they were empty (i.e. _blank) fields + // So that they can be loaded as NaNs if they're single/double columns + // Or as default if they aren't. + for (int srcCur = Math.Max(min, fields.Count); srcCur < lim; srcCur++) + { + v.Consume(irow, indexBase + srcCur, ref _blank); + } + } ivDst += sizeSeg; } Contracts.Assert(ivDst == size); @@ -1445,6 +1472,15 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long v.Rows.Stats.LogBadValue(line, info.Name); } } + else if(_imputeReals && v.IsReal) + { + // If the user has set the EmptyRealsAsNan option to true, + // And there are missing columns on a given row, + // then we should load them as if they were empty (i.e. _blank) fields + // So that they can be loaded as NaNs if they're single/double columns + // Or as default if they aren't. 
+ v.Consume(irow, 0, ref _blank); + } else v.Reset(irow, 0); } diff --git a/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv b/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv index fdf3522f51..b9868d508d 100644 --- a/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv +++ b/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv @@ -7,10 +7,9 @@ int,description,num1,num2,date,num3,num4 3,"this has an empty quoted num1 and a quoted space in num3",NaN,33.33,3/3/2003,NaN,3333.3333 4,"this has a space in num2 and a space in num4",4.4,NaN,4/4/2004,444.444,NaN 5,"this has a quoted space num2 and quoted space in num4",5.5,NaN,5/5/2005,555.555,NaN -// The next two rows map the missing columns as 0, as it was decided not to impute with NaN -// in this case -6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0 -7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0 +// The next two rows map the missing columns as NaN, as it was decided to impute them as well +6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,NaN,NaN +7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,NaN // In the next case we do impute with NaN because the separator is there 8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,NaN 9,,9.9,99.99,9/9/2009,999.999,NaN From 1cc7eff93d5739efbdc9e9ed40ecd955643015e0 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 8 Jun 2020 16:42:52 -0700 Subject: [PATCH 17/21] Change name to MissingRealsAsNaNs --- .../DataLoadSave/Text/TextLoader.cs | 23 +++++++++++-------- .../DataLoadSave/Text/TextLoaderParser.cs | 20 ++++++++-------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs 
b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 76cb6c2afa..52f27acc1e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -533,10 +533,13 @@ public class Options public char EscapeChar = Defaults.EscapeChar; /// - /// If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false. + /// If true, missing real fields (i.e. double or single fields) will be loaded as NaN. + /// If false, they'll be loaded as 0. Default is false. + /// A field is considered "missing" if it's empty, if it only has whitespace, or if there are missing columns + /// at the end of a given row. /// - [Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "imputefloat")] - public bool ImputeEmptyFloats = Defaults.ImputeEmptyFloats; + [Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "missingrealnan")] + public bool MissingRealsAsNaNs = Defaults.MissingRealsAsNaNs; /// /// Checks that all column specifications are valid (that is, ranges are disjoint and have min<=max). 
@@ -557,7 +560,7 @@ internal static class Defaults internal const bool TrimWhitespace = false; internal const bool ReadMultilines = false; internal const char EscapeChar = '"'; - internal const bool ImputeEmptyFloats = false; + internal const bool MissingRealsAsNaNs = false; } /// @@ -1085,7 +1088,7 @@ private static VersionInfo GetVersionInfo() //verWrittenCur: 0x0001000B, // Header now retained if used and present //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags //verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars - verWrittenCur: 0x0001000E, // Added imputeEmptyFloats flag + verWrittenCur: 0x0001000E, // Added MissingRealsAsNaNs flag verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1104,8 +1107,8 @@ private enum OptionFlags : uint AllowQuoting = 0x04, AllowSparse = 0x08, ReadMultilines = 0x10, - ImputeEmptyFloats = 0x20, - All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | ImputeEmptyFloats + MissingRealsAsNaNs = 0x20, + All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | MissingRealsAsNaNs } // This is reserved to mean the range extends to the end (the segment is variable). @@ -1187,8 +1190,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo _flags |= OptionFlags.AllowSparse; if (options.AllowQuoting && options.ReadMultilines) _flags |= OptionFlags.ReadMultilines; - if (options.ImputeEmptyFloats) - _flags |= OptionFlags.ImputeEmptyFloats; + if (options.MissingRealsAsNaNs) + _flags |= OptionFlags.MissingRealsAsNaNs; // REVIEW: This should be persisted (if it should be maintained). _maxRows = options.MaxRows ?? 
long.MaxValue; @@ -1431,7 +1434,7 @@ private TextLoader(IHost host, ModelLoadContext ctx) } if(ctx.Header.ModelVerWritten >= 0x0001000E) { - acceptableFlags |= OptionFlags.ImputeEmptyFloats; + acceptableFlags |= OptionFlags.MissingRealsAsNaNs; } host.CheckDecode((_flags & ~acceptableFlags) == 0); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 147608f574..dbc54a2998 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -657,7 +657,7 @@ public void Clear() private readonly char[] _separators; private readonly OptionFlags _flags; - private readonly bool _imputeReals; + private readonly bool _missingRealsAsNaNs; private readonly char _escapeChar; private readonly int _inputSize; private readonly ColInfo[] _infos; @@ -682,7 +682,7 @@ public Parser(TextLoader parent) var doubleParserOptionFlags = DoubleParser.OptionFlags.Default; if (parent._decimalMarker == ',') doubleParserOptionFlags |= DoubleParser.OptionFlags.UseCommaAsDecimalMarker; - if ((parent._flags & OptionFlags.ImputeEmptyFloats) != 0) + if ((parent._flags & OptionFlags.MissingRealsAsNaNs) != 0) doubleParserOptionFlags |= DoubleParser.OptionFlags.EmptyAsNaN; if (doubleParserOptionFlags == DoubleParser.OptionFlags.Default) @@ -726,7 +726,7 @@ public Parser(TextLoader parent) _flags = parent._flags; _escapeChar = parent._escapeChar; _inputSize = parent._inputSize; - _imputeReals = (parent._flags & OptionFlags.ImputeEmptyFloats) != 0; + _missingRealsAsNaNs = (parent._flags & OptionFlags.MissingRealsAsNaNs) != 0; _blank = ReadOnlyMemory.Empty; Contracts.Assert(_inputSize >= 0); } @@ -947,10 +947,10 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC _blank = ReadOnlyMemory.Empty; Fields = new FieldSet(); - // If we want to impute empty float fields, then we must keep - // all empty fields spans, as 
there's no way for the Parser.HelperImpl + // If we want to impute empty real fields as NaNs, then we must keep + // all empty field spans, as there's no way for the Parser.HelperImpl // to know beforehand which fields belong to a float field - _keepEmpty = (flags & OptionFlags.ImputeEmptyFloats) != 0; + _keepEmpty = (flags & OptionFlags.MissingRealsAsNaNs) != 0; } /// @@ -1438,9 +1438,9 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, } } - if(_imputeReals && isrc >= fields.Count && v.IsReal) + if(_missingRealsAsNaNs && isrc >= fields.Count && v.IsReal) { - // If the user has set the EmptyRealsAsNan option to true, + // If the user has set the MissingRealsAsNaNs option to true, // And there are missing columns on a given row, // then we should load them as if they were empty (i.e. _blank) fields // So that they can be loaded as NaNs if they're single/double columns @@ -1472,9 +1472,9 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long v.Rows.Stats.LogBadValue(line, info.Name); } } - else if(_imputeReals && v.IsReal) + else if(_missingRealsAsNaNs && v.IsReal) { - // If the user has set the EmptyRealsAsNan option to true, + // If the user has set the MissingRealsAsNaNs option to true, // And there are missing columns on a given row, // then we should load them as if they were empty (i.e. 
_blank) fields // So that they can be loaded as NaNs if they're single/double columns From b0744125c9347fcd556b21f9b977163c8d99f678 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 8 Jun 2020 17:43:14 -0700 Subject: [PATCH 18/21] Added MAML Test --- ...TextLoaderWithMissingRealsAsNaNs-1-out.txt | 27 +++++++++++++++++++ ...peTextLoaderWithMissingRealsAsNaNs-out.txt | 27 +++++++++++++++++++ .../TestCommandBase.cs | 20 ++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-1-out.txt create mode 100644 test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-out.txt diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-1-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-1-out.txt new file mode 100644 index 0000000000..20e86e3ad1 --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-1-out.txt @@ -0,0 +1,27 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=date:DT:2 +#@ col=sing1:R4:3 +#@ col=sing2:R4:4 +#@ col=singFt1:R4:5-6 +#@ col=doubFt:R8:7-10 +#@ } +id description date sing1 sing2 num1 num2 num1 num2 num3 num4 +0 this is a description "2001-01-01T%Time%Z" 0.12 0.34 0.12 0.34 0.12 0.34000000000000002 0.56000000000000005 0.78000000000000003 +? this has an empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +? this has a quoted empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +1 this has a quoted int and date "2001-01-01T%Time%Z" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +2 this has an empty num1 and a space in num3 "2002-02-02T%Time%Z" ? 22.22 ? 22.22 ? 22.219999999999999 ? 
2222.2222000000002 +3 this has an empty quoted num1 and a quoted space in num3 "2003-03-03T%Time%Z" ? 33.33 ? 33.33 ? 33.329999999999998 ? 3333.3332999999998 +4 this has a space in num2 and a space in num4 "2004-04-04T%Time%Z" 4.4 ? 4.4 ? 4.4000000000000004 ? 444.44400000000002 ? +5 this has a quoted space num2 and quoted space in num4 "2005-05-05T%Time%Z" 5.5 ? 5.5 ? 5.5 ? 555.55499999999995 ? +6 this has no date, num3 or num4 (the separator corresponding to them is also missing) "0001-01-01T%Time%" 6.6 66.66 6.6 66.66 6.5999999999999996 66.659999999999997 ? ? +7 this has no num4 (the separator corresponding to it is missing) "2007-07-07T%Time%Z" 7.7 77.77 7.7 77.77 7.7000000000000002 77.769999999999996 777.77700000000004 ? +8 this has nothing in num4, but includes the last separator "2008-08-08T%Time%Z" 8.8 88.88 8.8 88.88 8.8000000000000007 88.879999999999995 888.88800000000003 ? +9 "" "2009-09-09T%Time%Z" 9.9 99.99 9.9 99.99 9.9000000000000004 99.989999999999995 999.99900000000002 ? +? "" "2010-10-10T%Time%Z" 10.1 ? 10.1 ? 10.1 ? 101010.10101 ? +11 NaN "2011-11-11T%Time%Z" ? ? ? ? ? ? ? Infinity +Wrote 14 rows of length 11 \ No newline at end of file diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-out.txt new file mode 100644 index 0000000000..20e86e3ad1 --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-out.txt @@ -0,0 +1,27 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=date:DT:2 +#@ col=sing1:R4:3 +#@ col=sing2:R4:4 +#@ col=singFt1:R4:5-6 +#@ col=doubFt:R8:7-10 +#@ } +id description date sing1 sing2 num1 num2 num1 num2 num3 num4 +0 this is a description "2001-01-01T%Time%Z" 0.12 0.34 0.12 0.34 0.12 0.34000000000000002 0.56000000000000005 0.78000000000000003 +? 
this has an empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +? this has a quoted empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +1 this has a quoted int and date "2001-01-01T%Time%Z" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +2 this has an empty num1 and a space in num3 "2002-02-02T%Time%Z" ? 22.22 ? 22.22 ? 22.219999999999999 ? 2222.2222000000002 +3 this has an empty quoted num1 and a quoted space in num3 "2003-03-03T%Time%Z" ? 33.33 ? 33.33 ? 33.329999999999998 ? 3333.3332999999998 +4 this has a space in num2 and a space in num4 "2004-04-04T%Time%Z" 4.4 ? 4.4 ? 4.4000000000000004 ? 444.44400000000002 ? +5 this has a quoted space num2 and quoted space in num4 "2005-05-05T%Time%Z" 5.5 ? 5.5 ? 5.5 ? 555.55499999999995 ? +6 this has no date, num3 or num4 (the separator corresponding to them is also missing) "0001-01-01T%Time%" 6.6 66.66 6.6 66.66 6.5999999999999996 66.659999999999997 ? ? +7 this has no num4 (the separator corresponding to it is missing) "2007-07-07T%Time%Z" 7.7 77.77 7.7 77.77 7.7000000000000002 77.769999999999996 777.77700000000004 ? +8 this has nothing in num4, but includes the last separator "2008-08-08T%Time%Z" 8.8 88.88 8.8 88.88 8.8000000000000007 88.879999999999995 888.88800000000003 ? +9 "" "2009-09-09T%Time%Z" 9.9 99.99 9.9 99.99 9.9000000000000004 99.989999999999995 999.99900000000002 ? +? "" "2010-10-10T%Time%Z" 10.1 ? 10.1 ? 10.1 ? 101010.10101 ? +11 NaN "2011-11-11T%Time%Z" ? ? ? ? ? ? ? 
Infinity +Wrote 14 rows of length 11 \ No newline at end of file diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index c1d8c941a2..332ed22a17 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2167,6 +2167,26 @@ public void SavePipeTextLoaderWithMultilines() Done(); } + [TestCategory("DataPipeSerialization")] + [Fact()] + public void SavePipeTextLoaderWithMissingRealsAsNaNs() + { + string dataPath = GetDataPath("missing_fields.csv"); + const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ escapechar=\\ missingrealnan+ " + + "col=id:Num:0 col=description:TX:1 col=date:DT:4 " + + "col=sing1:R4:2 col=sing2:R4:3 col=singFt1:R4:2-3 " + + "col=doubFt:R8:2-3,5-6}"; + + OutputPath modelPath = ModelPath(); + string extraArgs = null; + TestCore("showdata", dataPath, loaderArgs, extraArgs); + + _step++; + + TestCore("showdata", dataPath, string.Format("in={{{0}}}", modelPath.Path), ""); + Done(); + } + [TestCategory("DataPipeSerialization")] [Fact()] public void SavePipeChooseColumnsByIndexDrop() From 19241ef43efedfbacc06d18eb843c495bf6a1b9d Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 8 Jun 2020 17:44:07 -0700 Subject: [PATCH 19/21] Updated test --- test/Microsoft.ML.Tests/TextLoaderTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 56dd550540..b42f4c54f7 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -1438,7 +1438,7 @@ public void TestLoadTextWithEmptyFloat(bool useImputeEmptyFloats) if(useImputeEmptyFloats) { baselineDV = mlContext.Data.LoadFromTextFile(baselineWithImpute, options); - options.ImputeEmptyFloats = true; + options.MissingRealsAsNaNs = true; testDV = mlContext.Data.LoadFromTextFile(inputPath, options); } else 
From 00a972b824c342db5cb8d966ec973b780bb84af9 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 8 Jun 2020 17:50:45 -0700 Subject: [PATCH 20/21] Updated manifest and fixed typo --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 4 ++-- test/BaselineOutput/Common/EntryPoints/core_manifest.json | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 52f27acc1e..117cc6e075 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -433,8 +433,8 @@ public class Options /// [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value" + - "from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all" + + "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value " + + "from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all " + "separators, even those within quotes, are treated as delimiting a new column.", ShortName = "quote")] public bool AllowQuoting = Defaults.AllowQuoting; diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index c897ff4f9d..88d3047d1c 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -319,7 +319,7 @@ { "Name": "AllowQuoting", "Type": "Bool", - "Desc": "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input valuefrom actual separators. 
When true, separators within double quotes are treated as part of the input value. When false, allseparators, even those within quotes, are treated as delimiting a new column.", + "Desc": "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all separators, even those within quotes, are treated as delimiting a new column.", "Aliases": [ "quote" ], @@ -466,11 +466,11 @@ "Default": "\"" }, { - "Name": "ImputeEmptyFloats", + "Name": "MissingRealsAsNaNs", "Type": "Bool", "Desc": "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", "Aliases": [ - "imputefloat" + "missingrealnan" ], "Required": false, "SortOrder": 150.0, From a4172283f2376037b36b738ddb703f3ba284babb Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 8 Jun 2020 18:00:22 -0700 Subject: [PATCH 21/21] Removed unused variable --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index dbc54a2998..33dba6a843 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -1401,7 +1401,6 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, Contracts.Assert(sizeVar >= 0); int size = checked(info.SizeBase + sizeVar); - var xx = fields.Spans[0].ToString(); v.Reset(irow, size); int ivDst = 0; for (int i = 0; i < info.Segments.Length; i++)