diff --git a/docs/code/IDataViewTypeSystem.md b/docs/code/IDataViewTypeSystem.md index cabf3dd54e..401c9b1cac 100644 --- a/docs/code/IDataViewTypeSystem.md +++ b/docs/code/IDataViewTypeSystem.md @@ -540,7 +540,10 @@ is first processed entirely as `TX` values, then parsed, or processed directly into numeric values, that is, parsing as the row is processed. In the latter case, it is simple to map implicit items (suppressed due to sparsity) to zero. In the former case, these items are first mapped to the empty text value. To -get the same result, we need empty text to map to zero. +get the same result, we need empty text to map to zero. An exception to this +rule has been permitted in the TextLoader, where there's an option to load +empty `TX` fields as `NaN` for `R4` and `R8` fields, instead of using the default +conversion of empty `TX` to the numeric default `0`. ### To Text diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs index 58ee99f5ef..9e35b7fb14 100644 --- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs +++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs @@ -23,6 +23,11 @@ internal enum OptionFlags : uint // a number and its decimal part). If this isn't set, then // default behavior is to use "." as decimal marker. UseCommaAsDecimalMarker = 0x01, + + // If this flag is set, then empty spans (or those with only white-space) + // will be parsed as NaN. If it isn't set, then default behavior + // is to return them as 0. + EmptyAsNaN = 0x02, } private const ulong TopBit = 0x8000000000000000UL; @@ -81,22 +86,22 @@ public enum Result } /// - /// This produces zero for an empty string. + /// This produces zero for an empty string, or NaN depending on the OptionFlags used. 
/// public static bool TryParse(ReadOnlySpan span, out Single value, OptionFlags flags = OptionFlags.Default) { var res = Parse(span, out value, flags); - Contracts.Assert(res != Result.Empty || value == 0); + Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Single.IsNaN(value)); return res <= Result.Empty; } /// - /// This produces zero for an empty string. + /// This produces zero for an empty string, or NaN depending on the OptionFlags used. /// public static bool TryParse(ReadOnlySpan span, out Double value, OptionFlags flags = OptionFlags.Default) { var res = Parse(span, out value, flags); - Contracts.Assert(res != Result.Empty || value == 0); + Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Double.IsNaN(value)); return res <= Result.Empty; } @@ -107,7 +112,11 @@ public static Result Parse(ReadOnlySpan span, out Single value, OptionFlag { if (ich >= span.Length) { - value = 0; + if ((flags & OptionFlags.EmptyAsNaN) == 0) + value = 0; + else + value = Single.NaN; + return Result.Empty; } if (!char.IsWhiteSpace(span[ich])) @@ -155,7 +164,11 @@ public static Result Parse(ReadOnlySpan span, out Double value, OptionFlag { if (ich >= span.Length) { - value = 0; + if ((flags & OptionFlags.EmptyAsNaN) == 0) + value = 0; + else + value = Double.NaN; + return Result.Empty; } if (!char.IsWhiteSpace(span[ich])) diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index bdc4f8a409..cf48c3f295 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -1369,7 +1369,8 @@ private void TryParseSigned(long max, in TX text, out long? result) } /// - /// This produces zero for empty. It returns false if the text is not parsable. + /// This produces zero for empty, or NaN depending on the DoubleParser.OptionFlags used. + /// It returns false if the text is not parsable. /// On failure, it sets dst to the NA value. 
/// public bool TryParse(in TX src, out R4 dst) @@ -1382,7 +1383,8 @@ public bool TryParse(in TX src, out R4 dst) } /// - /// This produces zero for empty. It returns false if the text is not parsable. + /// This produces zero for empty, or NaN depending on the DoubleParser.OptionFlags used. + /// It returns false if the text is not parsable. /// On failure, it sets dst to the NA value. /// public bool TryParse(in TX src, out R8 dst) @@ -1394,6 +1396,9 @@ public bool TryParse(in TX src, out R8 dst) return IsStdMissing(ref span); } + /// + /// This produces default for empty. + /// public bool TryParse(in TX src, out TS dst) { if (src.IsEmpty) @@ -1408,6 +1413,9 @@ public bool TryParse(in TX src, out TS dst) return false; } + /// + /// This produces default for empty. + /// public bool TryParse(in TX src, out DT dst) { if (src.IsEmpty) @@ -1422,6 +1430,9 @@ public bool TryParse(in TX src, out DT dst) return false; } + /// + /// This produces default for empty. + /// public bool TryParse(in TX src, out DZ dst) { if (src.IsEmpty) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 2cd1e8e8cf..117cc6e075 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -433,10 +433,9 @@ public class Options /// [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether the input may include quoted values, which can contain separator characters, colons," + - " and distinguish empty values from missing values. When true, consecutive separators denote a" + - " missing value and an empty value is denoted by \"\". When false, consecutive separators" + - " denote an empty value.", + "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value " + + "from actual separators. When true, separators within double quotes are treated as part of the input value. 
When false, all " + + "separators, even those within quotes, are treated as delimiting a new column.", ShortName = "quote")] public bool AllowQuoting = Defaults.AllowQuoting; @@ -533,6 +532,15 @@ public class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")] public char EscapeChar = Defaults.EscapeChar; + /// + /// If true, missing real fields (i.e. double or single fields) will be loaded as NaN. + /// If false, they'll be loaded as 0. Default is false. + /// A field is considered "missing" if it's empty, if it only has whitespace, or if there are missing columns + /// at the end of a given row. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "missingrealnan")] + public bool MissingRealsAsNaNs = Defaults.MissingRealsAsNaNs; + /// /// Checks that all column specifications are valid (that is, ranges are disjoint and have min<=max). 
/// @@ -552,6 +560,7 @@ internal static class Defaults internal const bool TrimWhitespace = false; internal const bool ReadMultilines = false; internal const char EscapeChar = '"'; + internal const bool MissingRealsAsNaNs = false; } /// @@ -1078,7 +1087,8 @@ private static VersionInfo GetVersionInfo() //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header now retained if used and present //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags - verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker + //verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars + verWrittenCur: 0x0001000E, // Added MissingRealsAsNaNs flag verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1097,7 +1107,8 @@ private enum OptionFlags : uint AllowQuoting = 0x04, AllowSparse = 0x08, ReadMultilines = 0x10, - All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines + MissingRealsAsNaNs = 0x20, + All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | MissingRealsAsNaNs } // This is reserved to mean the range extends to the end (the segment is variable). @@ -1179,6 +1190,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo _flags |= OptionFlags.AllowSparse; if (options.AllowQuoting && options.ReadMultilines) _flags |= OptionFlags.ReadMultilines; + if (options.MissingRealsAsNaNs) + _flags |= OptionFlags.MissingRealsAsNaNs; // REVIEW: This should be persisted (if it should be maintained). _maxRows = options.MaxRows ?? 
long.MaxValue; @@ -1407,7 +1420,25 @@ private TextLoader(IHost host, ModelLoadContext ctx) _maxRows = ctx.Reader.ReadInt64(); host.CheckDecode(_maxRows > 0); _flags = (OptionFlags)ctx.Reader.ReadUInt32(); - host.CheckDecode((_flags & ~OptionFlags.All) == 0); + + // Flags introduced with the first ML.NET commit: + var acceptableFlags = OptionFlags.TrimWhitespace; + acceptableFlags |= OptionFlags.HasHeader; + acceptableFlags |= OptionFlags.AllowQuoting; + acceptableFlags |= OptionFlags.AllowSparse; + + // Flags added on later versions of TextLoader: + if(ctx.Header.ModelVerWritten >= 0x0001000C) + { + acceptableFlags |= OptionFlags.ReadMultilines; + } + if(ctx.Header.ModelVerWritten >= 0x0001000E) + { + acceptableFlags |= OptionFlags.MissingRealsAsNaNs; + } + + host.CheckDecode((_flags & ~acceptableFlags) == 0); + _inputSize = ctx.Reader.ReadInt32(); host.CheckDecode(0 <= _inputSize && _inputSize < SrcLim); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 897c257e2a..33dba6a843 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -228,6 +228,8 @@ private abstract class ColumnPipe public abstract bool HasNA { get; } + public abstract bool IsReal { get; } // If the type of the ColumnPipe is either Single or Double + protected ColumnPipe(RowSet rows) { Contracts.AssertValue(rows); @@ -251,6 +253,8 @@ private sealed class PrimitivePipe : ColumnPipe public override bool HasNA { get; } + public override bool IsReal { get; } + public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper conv) : base(rows) { @@ -259,6 +263,7 @@ public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper : ColumnPipe public override bool HasNA { get; } + public override bool IsReal { get; } + private class VectorValue { private readonly VectorPipe _pipe; @@ -441,6 +448,7 @@ public 
VectorPipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper for (int i = 0; i < _values.Length; i++) _values[i] = new VectorValue(this); HasNA = Conversions.DefaultInstance.TryGetIsNAPredicate(type, out var del); + IsReal = typeof(TItem) == typeof(Single) || typeof(TItem) == typeof(Double); } public override void Reset(int irow, int size) @@ -649,6 +657,7 @@ public void Clear() private readonly char[] _separators; private readonly OptionFlags _flags; + private readonly bool _missingRealsAsNaNs; private readonly char _escapeChar; private readonly int _inputSize; private readonly ColInfo[] _infos; @@ -659,6 +668,8 @@ public void Clear() private volatile int _csrc; private volatile int _mismatchCount; + private ReadOnlyMemory _blank; + public Parser(TextLoader parent) { Contracts.AssertValue(parent); @@ -671,6 +682,8 @@ public Parser(TextLoader parent) var doubleParserOptionFlags = DoubleParser.OptionFlags.Default; if (parent._decimalMarker == ',') doubleParserOptionFlags |= DoubleParser.OptionFlags.UseCommaAsDecimalMarker; + if ((parent._flags & OptionFlags.MissingRealsAsNaNs) != 0) + doubleParserOptionFlags |= DoubleParser.OptionFlags.EmptyAsNaN; if (doubleParserOptionFlags == DoubleParser.OptionFlags.Default) cache = ValueCreatorCache.DefaultInstance; @@ -713,6 +726,8 @@ public Parser(TextLoader parent) _flags = parent._flags; _escapeChar = parent._escapeChar; _inputSize = parent._inputSize; + _missingRealsAsNaNs = (parent._flags & OptionFlags.MissingRealsAsNaNs) != 0; + _blank = ReadOnlyMemory.Empty; Contracts.Assert(_inputSize >= 0); } @@ -900,6 +915,7 @@ private sealed class HelperImpl : Helper private readonly int _srcNeeded; private readonly bool _quoting; private readonly bool _sparse; + private readonly bool _keepEmpty; // This is a working buffer. 
private readonly StringBuilder _sb; @@ -930,6 +946,11 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC _sb = new StringBuilder(); _blank = ReadOnlyMemory.Empty; Fields = new FieldSet(); + + // If we want to impute empty real fields as NaNs, then we must keep + // all empty field spans, as there's no way for the Parser.HelperImpl + // to know beforehand which fields belong to a float field + _keepEmpty = (flags & OptionFlags.MissingRealsAsNaNs) != 0; } /// @@ -978,6 +999,13 @@ public int GatherFields(ReadOnlyMemory lineSpan, ReadOnlySpan span, Fields.Spans[Fields.Count] = scan.Span; Fields.Indices[Fields.Count++] = src; } + else if(_keepEmpty) + { + Fields.EnsureSpace(); + Fields.Spans[Fields.Count] = _blank; + Fields.Indices[Fields.Count++] = src; + } + if (++src > _srcNeeded || !more) break; } @@ -1390,10 +1418,10 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, int sizeSeg = lim - min; Contracts.Assert(ivDst <= size - sizeSeg); + int indexBase = ivDst - min; int isrc = fields.Indices.FindIndexSorted(0, fields.Count, min); if (isrc < fields.Count && fields.Indices[isrc] < lim) { - int indexBase = ivDst - min; int isrcLim = fields.Indices.FindIndexSorted(isrc, fields.Count, lim); Contracts.Assert(isrc < isrcLim); for (; isrc < isrcLim; isrc++) @@ -1408,6 +1436,19 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, } } } + + if(_missingRealsAsNaNs && isrc >= fields.Count && v.IsReal) + { + // If the user has set the MissingRealsAsNaNs option to true, + // And there are missing columns on a given row, + // then we should load them as if they were empty (i.e. _blank) fields + // So that they can be loaded as NaNs if they're single/double columns + // Or as default if they aren't. 
+ for (int srcCur = Math.Max(min, fields.Count); srcCur < lim; srcCur++) + { + v.Consume(irow, indexBase + srcCur, ref _blank); + } + } ivDst += sizeSeg; } Contracts.Assert(ivDst == size); @@ -1430,6 +1471,15 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long v.Rows.Stats.LogBadValue(line, info.Name); } } + else if(_missingRealsAsNaNs && v.IsReal) + { + // If the user has set the MissingRealsAsNaNs option to true, + // And there are missing columns on a given row, + // then we should load them as if they were empty (i.e. _blank) fields + // So that they can be loaded as NaNs if they're single/double columns + // Or as default if they aren't. + v.Consume(irow, 0, ref _blank); + } else v.Reset(irow, 0); } diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-1-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-1-out.txt new file mode 100644 index 0000000000..20e86e3ad1 --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-1-out.txt @@ -0,0 +1,27 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=date:DT:2 +#@ col=sing1:R4:3 +#@ col=sing2:R4:4 +#@ col=singFt1:R4:5-6 +#@ col=doubFt:R8:7-10 +#@ } +id description date sing1 sing2 num1 num2 num1 num2 num3 num4 +0 this is a description "2001-01-01T%Time%Z" 0.12 0.34 0.12 0.34 0.12 0.34000000000000002 0.56000000000000005 0.78000000000000003 +? this has an empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +? this has a quoted empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +1 this has a quoted int and date "2001-01-01T%Time%Z" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +2 this has an empty num1 and a space in num3 "2002-02-02T%Time%Z" ? 22.22 ? 22.22 ? 
22.219999999999999 ? 2222.2222000000002 +3 this has an empty quoted num1 and a quoted space in num3 "2003-03-03T%Time%Z" ? 33.33 ? 33.33 ? 33.329999999999998 ? 3333.3332999999998 +4 this has a space in num2 and a space in num4 "2004-04-04T%Time%Z" 4.4 ? 4.4 ? 4.4000000000000004 ? 444.44400000000002 ? +5 this has a quoted space num2 and quoted space in num4 "2005-05-05T%Time%Z" 5.5 ? 5.5 ? 5.5 ? 555.55499999999995 ? +6 this has no date, num3 or num4 (the separator corresponding to them is also missing) "0001-01-01T%Time%" 6.6 66.66 6.6 66.66 6.5999999999999996 66.659999999999997 ? ? +7 this has no num4 (the separator corresponding to it is missing) "2007-07-07T%Time%Z" 7.7 77.77 7.7 77.77 7.7000000000000002 77.769999999999996 777.77700000000004 ? +8 this has nothing in num4, but includes the last separator "2008-08-08T%Time%Z" 8.8 88.88 8.8 88.88 8.8000000000000007 88.879999999999995 888.88800000000003 ? +9 "" "2009-09-09T%Time%Z" 9.9 99.99 9.9 99.99 9.9000000000000004 99.989999999999995 999.99900000000002 ? +? "" "2010-10-10T%Time%Z" 10.1 ? 10.1 ? 10.1 ? 101010.10101 ? +11 NaN "2011-11-11T%Time%Z" ? ? ? ? ? ? ? Infinity +Wrote 14 rows of length 11 \ No newline at end of file diff --git a/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-out.txt b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-out.txt new file mode 100644 index 0000000000..20e86e3ad1 --- /dev/null +++ b/test/BaselineOutput/Common/Command/SavePipeTextLoaderWithMissingRealsAsNaNs-out.txt @@ -0,0 +1,27 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=id:R4:0 +#@ col=description:TX:1 +#@ col=date:DT:2 +#@ col=sing1:R4:3 +#@ col=sing2:R4:4 +#@ col=singFt1:R4:5-6 +#@ col=doubFt:R8:7-10 +#@ } +id description date sing1 sing2 num1 num2 num1 num2 num3 num4 +0 this is a description "2001-01-01T%Time%Z" 0.12 0.34 0.12 0.34 0.12 0.34000000000000002 0.56000000000000005 0.78000000000000003 +? 
this has an empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +? this has a quoted empty int and date "0001-01-01T%Time%" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +1 this has a quoted int and date "2001-01-01T%Time%Z" 1.1 11.11 1.1 11.11 1.1000000000000001 11.109999999999999 111.111 1111.1111100000001 +2 this has an empty num1 and a space in num3 "2002-02-02T%Time%Z" ? 22.22 ? 22.22 ? 22.219999999999999 ? 2222.2222000000002 +3 this has an empty quoted num1 and a quoted space in num3 "2003-03-03T%Time%Z" ? 33.33 ? 33.33 ? 33.329999999999998 ? 3333.3332999999998 +4 this has a space in num2 and a space in num4 "2004-04-04T%Time%Z" 4.4 ? 4.4 ? 4.4000000000000004 ? 444.44400000000002 ? +5 this has a quoted space num2 and quoted space in num4 "2005-05-05T%Time%Z" 5.5 ? 5.5 ? 5.5 ? 555.55499999999995 ? +6 this has no date, num3 or num4 (the separator corresponding to them is also missing) "0001-01-01T%Time%" 6.6 66.66 6.6 66.66 6.5999999999999996 66.659999999999997 ? ? +7 this has no num4 (the separator corresponding to it is missing) "2007-07-07T%Time%Z" 7.7 77.77 7.7 77.77 7.7000000000000002 77.769999999999996 777.77700000000004 ? +8 this has nothing in num4, but includes the last separator "2008-08-08T%Time%Z" 8.8 88.88 8.8 88.88 8.8000000000000007 88.879999999999995 888.88800000000003 ? +9 "" "2009-09-09T%Time%Z" 9.9 99.99 9.9 99.99 9.9000000000000004 99.989999999999995 999.99900000000002 ? +? "" "2010-10-10T%Time%Z" 10.1 ? 10.1 ? 10.1 ? 101010.10101 ? +11 NaN "2011-11-11T%Time%Z" ? ? ? ? ? ? ? 
Infinity +Wrote 14 rows of length 11 \ No newline at end of file diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 6a0c17a44f..88d3047d1c 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -319,7 +319,7 @@ { "Name": "AllowQuoting", "Type": "Bool", - "Desc": "Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.", + "Desc": "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all separators, even those within quotes, are treated as delimiting a new column.", "Aliases": [ "quote" ], @@ -464,6 +464,18 @@ "SortOrder": 150.0, "IsNullable": false, "Default": "\"" + }, + { + "Name": "MissingRealsAsNaNs", + "Type": "Bool", + "Desc": "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. 
Default is false.", + "Aliases": [ + "missingrealnan" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false } ] }, diff --git a/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv b/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv new file mode 100644 index 0000000000..b9868d508d --- /dev/null +++ b/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv @@ -0,0 +1,17 @@ +int,description,num1,num2,date,num3,num4 +0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78 +0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +1,"this has a quoted int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111 +2,"this has an empty num1 and a space in num3",NaN,22.22,2/2/2002,NaN,2222.2222 +3,"this has an empty quoted num1 and a quoted space in num3",NaN,33.33,3/3/2003,NaN,3333.3333 +4,"this has a space in num2 and a space in num4",4.4,NaN,4/4/2004,444.444,NaN +5,"this has a quoted space num2 and quoted space in num4",5.5,NaN,5/5/2005,555.555,NaN +// The next two rows map the missing columns as NaN, as it was decided to impute them as well +6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,NaN,NaN +7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,NaN +// In the next case we do impute with NaN because the separator is there +8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,NaN +9,,9.9,99.99,9/9/2009,999.999,NaN +0,"",10.10,NaN,10/10/2010,101010.101010,NaN +11,NaN,NaN,NaN,11/11/2011,NaN,Infinity \ No newline at end of file diff --git a/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv b/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv new file mode 100644 index 0000000000..95c6874c08 --- /dev/null +++ 
b/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv @@ -0,0 +1,15 @@ +int,description,num1,num2,date,num3,num4 +0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78 +0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111 +1,"this has a quoted int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111 +2,"this has an empty num1 and a space in num3",0,22.22,2/2/2002,0,2222.2222 +3,"this has an empty quoted num1 and a quoted space in num3",0,33.33,3/3/2003,0,3333.3333 +4,"this has a space in num2 and a space in num4",4.4,0,4/4/2004,444.444,0 +5,"this has a quoted space num2 and quoted space in num4",5.5,0,5/5/2005,555.555,0 +6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0 +7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0 +8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,0 +9,,9.9,99.99,9/9/2009,999.999,NaN +0,,10.10,NaN,10/10/2010,101010.101010,NaN +11,NaN,NaN,NaN,11/11/2011,0,Infinity \ No newline at end of file diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index c1d8c941a2..332ed22a17 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2167,6 +2167,26 @@ public void SavePipeTextLoaderWithMultilines() Done(); } + [TestCategory("DataPipeSerialization")] + [Fact()] + public void SavePipeTextLoaderWithMissingRealsAsNaNs() + { + string dataPath = GetDataPath("missing_fields.csv"); + const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ escapechar=\\ missingrealnan+ " + + "col=id:Num:0 col=description:TX:1 col=date:DT:4 " + + "col=sing1:R4:2 col=sing2:R4:3 col=singFt1:R4:2-3 " + + "col=doubFt:R8:2-3,5-6}"; + + OutputPath modelPath = 
ModelPath(); + string extraArgs = null; + TestCore("showdata", dataPath, loaderArgs, extraArgs); + + _step++; + + TestCore("showdata", dataPath, string.Format("in={{{0}}}", modelPath.Path), ""); + Done(); + } + [TestCategory("DataPipeSerialization")] [Fact()] public void SavePipeChooseColumnsByIndexDrop() diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 358fad1272..b42f4c54f7 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -719,7 +719,7 @@ public void LoaderColumnsFromIrisData(bool useOptionsObject) irisFirstRow["SepalWidth"] = 3.5f; irisFirstRow["PetalLength"] = 1.4f; irisFirstRow["PetalWidth"] = 0.2f; - + var irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); // Simple load @@ -1040,87 +1040,68 @@ public void TestDifferentDecimalMarkersAtTheSameTime(bool useCorrectPeriod, bool DecimalMarker = ',' }; - for (int j = 0; j < 2; j++) - { - // Run various times inside the same test, to also test that TextLoader is only creating 1 - // Custom instance of ValueCreatorCache - IDataView dataViewPeriod; - IDataView dataViewComma; + IDataView dataViewPeriod; + IDataView dataViewComma; - if (useCorrectPeriod) - dataViewPeriod = mlContext.Data.LoadFromTextFile(periodPath, optionsPeriod); - else - dataViewPeriod = mlContext.Data.LoadFromTextFile(commaPath, optionsPeriod); + if (useCorrectPeriod) + dataViewPeriod = mlContext.Data.LoadFromTextFile(periodPath, optionsPeriod); + else + dataViewPeriod = mlContext.Data.LoadFromTextFile(commaPath, optionsPeriod); - if (useCorrectComma) - dataViewComma = mlContext.Data.LoadFromTextFile(commaPath, optionsComma); - else - dataViewComma = mlContext.Data.LoadFromTextFile(periodPath, optionsComma); + if (useCorrectComma) + dataViewComma = mlContext.Data.LoadFromTextFile(commaPath, optionsComma); + else + dataViewComma = mlContext.Data.LoadFromTextFile(periodPath, optionsComma); - VBuffer featuresPeriod = default; - 
VBuffer featuresComma = default; + VBuffer featuresPeriod = default; + VBuffer featuresComma = default; - using (var cursorPeriod = dataViewPeriod.GetRowCursor(dataViewPeriod.Schema)) - using (var cursorComma = dataViewComma.GetRowCursor(dataViewComma.Schema)) + using (var cursorPeriod = dataViewPeriod.GetRowCursor(dataViewPeriod.Schema)) + using (var cursorComma = dataViewComma.GetRowCursor(dataViewComma.Schema)) + { + var delegatePeriod = cursorPeriod.GetGetter>(dataViewPeriod.Schema["Features"]); + var delegateComma = cursorComma.GetGetter>(dataViewPeriod.Schema["Features"]); + while (cursorPeriod.MoveNext() && cursorComma.MoveNext()) { - var delegatePeriod = cursorPeriod.GetGetter>(dataViewPeriod.Schema["Features"]); - var delegateComma = cursorComma.GetGetter>(dataViewPeriod.Schema["Features"]); - while (cursorPeriod.MoveNext() && cursorComma.MoveNext()) - { - delegatePeriod(ref featuresPeriod); - delegateComma(ref featuresComma); + delegatePeriod(ref featuresPeriod); + delegateComma(ref featuresComma); - var featuresPeriodArray = featuresPeriod.GetValues().ToArray(); - var featuresCommaArray = featuresComma.GetValues().ToArray(); - Assert.Equal(featuresPeriodArray.Length, featuresCommaArray.Length); + var featuresPeriodArray = featuresPeriod.GetValues().ToArray(); + var featuresCommaArray = featuresComma.GetValues().ToArray(); + Assert.Equal(featuresPeriodArray.Length, featuresCommaArray.Length); - for (int i = 0; i < featuresPeriodArray.Length; i++) + for (int i = 0; i < featuresPeriodArray.Length; i++) + { + if (useCorrectPeriod && useCorrectComma) { - if (useCorrectPeriod && useCorrectComma) - { - // Check that none of the two files loadad NaNs - // As both of them should have been loaded correctly - Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); - Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); - } - else if (!useCorrectPeriod && !useCorrectComma) - { - // Check that everything was loaded as NaN - // Because the wrong decimal marker was 
used for both loaders - Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); - Assert.Equal(Single.NaN, featuresPeriodArray[i]); - } - else if (!useCorrectPeriod && useCorrectComma) - { - // Check that only the file with commas was loaded correctly - Assert.Equal(Single.NaN, featuresPeriodArray[i]); - Assert.NotEqual(Single.NaN, featuresCommaArray[i]); - } - else - { - // Check that only the file with periods was loaded correctly - Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); - Assert.Equal(Single.NaN, featuresCommaArray[i]); - } + // Check that none of the two files loaded NaNs + // As both of them should have been loaded correctly + Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); + Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); + } + else if (!useCorrectPeriod && !useCorrectComma) + { + // Check that everything was loaded as NaN + // Because the wrong decimal marker was used for both loaders + Assert.Equal(featuresPeriodArray[i], featuresCommaArray[i]); + Assert.Equal(Single.NaN, featuresPeriodArray[i]); + } + else if (!useCorrectPeriod && useCorrectComma) + { + // Check that only the file with commas was loaded correctly + Assert.Equal(Single.NaN, featuresPeriodArray[i]); + Assert.NotEqual(Single.NaN, featuresCommaArray[i]); + } + else + { + // Check that only the file with periods was loaded correctly + Assert.NotEqual(Single.NaN, featuresPeriodArray[i]); + Assert.Equal(Single.NaN, featuresCommaArray[i]); } } } - - // Check how many custom instances there are of TextLoader.ValueCreatorCache - var vccType = typeof(TextLoader).GetNestedType("ValueCreatorCache", BindingFlags.NonPublic | BindingFlags.Static); - var customInstancesInfo = vccType.GetField("_customInstances", BindingFlags.NonPublic | BindingFlags.Static); - var customInstancesObject = customInstancesInfo.GetValue(null); - var customInstancesCount = (int)customInstancesObject.GetType().GetProperty("Count").GetValue(customInstancesObject, null); - var 
customInstancesContainsMethod = customInstancesObject.GetType().GetMethod("ContainsKey"); - - // Regardless of useCorrectPeriod and useCorrectComma - // Since we always created a TextLoader with Comma as DecimalMarker - // There should always be 1, and only 1, custom instance of ValueCreatorCache, corresponding to the comma option - // Even after running multiple times the loop above. - Assert.Equal(1, customInstancesCount); - Assert.True((bool)customInstancesContainsMethod.Invoke(customInstancesObject, new[] { (object) DoubleParser.OptionFlags.UseCommaAsDecimalMarker })); } } @@ -1398,7 +1379,7 @@ public void TestInvalidMultilineCSVQuote() new TextLoader.Column("id", DataKind.Int32, 0), new TextLoader.Column("description", DataKind.String, 1), new TextLoader.Column("animal", DataKind.String, 2), - }, + }, }; var data = mlContext.Data.LoadFromTextFile(filePath, options); @@ -1416,5 +1397,187 @@ public void TestInvalidMultilineCSVQuote() Assert.True(threwException, "Invalid file should have thrown an exception"); }
+
+        // Loads missing_fields.csv with the MissingRealsAsNaNs option on or off and compares every row
+        // against the matching baseline file. The vector columns (singFt*, doubFt*) are compared slot by
+        // slot with the baseline scalars that come from the same source columns.
+        [Theory]
+        [InlineData(true)]
+        [InlineData(false)]
+        public void TestLoadTextWithEmptyFloat(bool useImputeEmptyFloats)
+        {
+            var mlContext = new MLContext(seed: 1);
+            var inputPath = GetDataPath("missing_fields.csv");
+            var baselineWithImpute = GetBaselinePath("TextLoader", "missing_fields-with-impute.csv");
+            var baselineWithoutImpute = GetBaselinePath("TextLoader", "missing_fields-without-impute.csv");
+
+            var options = new TextLoader.Options()
+            {
+                HasHeader = true,
+                Separator = ",",
+                AllowQuoting = true,
+                Columns = new[]
+                {
+                    new TextLoader.Column("id", DataKind.Int32, 0),
+                    new TextLoader.Column("description", DataKind.String, 1),
+                    new TextLoader.Column("date", DataKind.DateTime, 4),
+                    new TextLoader.Column("sing1", DataKind.Single, 2),
+                    new TextLoader.Column("sing2", DataKind.Single, 3),
+                    new TextLoader.Column("singFt1", DataKind.Single, new [] { new TextLoader.Range(2,3) } ),
+                    new TextLoader.Column("sing3", DataKind.Single, 5),
+                    new TextLoader.Column("sing4", DataKind.Single, 6),
+                    new TextLoader.Column("singFt2", DataKind.Single, new [] { new TextLoader.Range(2,3), new TextLoader.Range(5,6) } ),
+                    new TextLoader.Column("doub1", DataKind.Double, 2),
+                    new TextLoader.Column("doub2", DataKind.Double, 3),
+                    new TextLoader.Column("doubFt1", DataKind.Double, new [] { new TextLoader.Range(2,3) } ),
+                    new TextLoader.Column("doub3", DataKind.Double, 5),
+                    new TextLoader.Column("doub4", DataKind.Double, 6),
+                    new TextLoader.Column("doubFt2", DataKind.Double, new [] { new TextLoader.Range(2,3), new TextLoader.Range(5,6) } )
+                },
+            };
+
+            IDataView baselineDV;
+            IDataView testDV;
+            if (useImputeEmptyFloats)
+            {
+                // NOTE(review): the baseline is loaded before the option is switched on; presumably the
+                // loader captures the options when it is created, so keep this order - confirm.
+                baselineDV = mlContext.Data.LoadFromTextFile(baselineWithImpute, options);
+                options.MissingRealsAsNaNs = true;
+                testDV = mlContext.Data.LoadFromTextFile(inputPath, options);
+            }
+            else
+            {
+                baselineDV = mlContext.Data.LoadFromTextFile(baselineWithoutImpute, options);
+                testDV = mlContext.Data.LoadFromTextFile(inputPath, options);
+            }
+
+            Int32 baselineId = default;
+            ReadOnlyMemory<char> baselineDescription = default;
+            DateTime baselineDate = default;
+            Single baselineSing1 = default;
+            Single baselineSing2 = default;
+            Single baselineSing3 = default;
+            Single baselineSing4 = default;
+            Double baselineDoub1 = default;
+            Double baselineDoub2 = default;
+            Double baselineDoub3 = default;
+            Double baselineDoub4 = default;
+
+            Int32 testId = default;
+            ReadOnlyMemory<char> testDescription = default;
+            DateTime testDate = default;
+            Single testSing1 = default;
+            Single testSing2 = default;
+            Single testSing3 = default;
+            Single testSing4 = default;
+            VBuffer<Single> testSingFt1 = default;
+            VBuffer<Single> testSingFt2 = default;
+            Double testDoub1 = default;
+            Double testDoub2 = default;
+            Double testDoub3 = default;
+            Double testDoub4 = default;
+            VBuffer<Double> testDoubFt1 = default;
+            VBuffer<Double> testDoubFt2 = default;
+
+            using (var cursorBaseline = baselineDV.GetRowCursor(baselineDV.Schema))
+            using (var cursorTest = testDV.GetRowCursor(testDV.Schema))
+            {
+                var delegateBaselineId = cursorBaseline.GetGetter<Int32>(baselineDV.Schema["id"]);
+                var delegateBaselineDescription = cursorBaseline.GetGetter<ReadOnlyMemory<char>>(baselineDV.Schema["description"]);
+                var delegateBaselineDate = cursorBaseline.GetGetter<DateTime>(baselineDV.Schema["date"]);
+                var delegateBaselineSing1 = cursorBaseline.GetGetter<Single>(baselineDV.Schema["sing1"]);
+                var delegateBaselineSing2 = cursorBaseline.GetGetter<Single>(baselineDV.Schema["sing2"]);
+                var delegateBaselineSing3 = cursorBaseline.GetGetter<Single>(baselineDV.Schema["sing3"]);
+                var delegateBaselineSing4 = cursorBaseline.GetGetter<Single>(baselineDV.Schema["sing4"]);
+                var delegateBaselineDoub1 = cursorBaseline.GetGetter<Double>(baselineDV.Schema["doub1"]);
+                var delegateBaselineDoub2 = cursorBaseline.GetGetter<Double>(baselineDV.Schema["doub2"]);
+                var delegateBaselineDoub3 = cursorBaseline.GetGetter<Double>(baselineDV.Schema["doub3"]);
+                var delegateBaselineDoub4 = cursorBaseline.GetGetter<Double>(baselineDV.Schema["doub4"]);
+
+                var delegateTestId = cursorTest.GetGetter<Int32>(testDV.Schema["id"]);
+                var delegateTestDescription = cursorTest.GetGetter<ReadOnlyMemory<char>>(testDV.Schema["description"]);
+                var delegateTestDate = cursorTest.GetGetter<DateTime>(testDV.Schema["date"]);
+                var delegateTestSing1 = cursorTest.GetGetter<Single>(testDV.Schema["sing1"]);
+                var delegateTestSing2 = cursorTest.GetGetter<Single>(testDV.Schema["sing2"]);
+                var delegateTestSing3 = cursorTest.GetGetter<Single>(testDV.Schema["sing3"]);
+                var delegateTestSing4 = cursorTest.GetGetter<Single>(testDV.Schema["sing4"]);
+                var delegateTestSingFt1 = cursorTest.GetGetter<VBuffer<Single>>(testDV.Schema["singFt1"]);
+                var delegateTestSingFt2 = cursorTest.GetGetter<VBuffer<Single>>(testDV.Schema["singFt2"]);
+                var delegateTestDoub1 = cursorTest.GetGetter<Double>(testDV.Schema["doub1"]);
+                var delegateTestDoub2 = cursorTest.GetGetter<Double>(testDV.Schema["doub2"]);
+                var delegateTestDoub3 = cursorTest.GetGetter<Double>(testDV.Schema["doub3"]);
+                var delegateTestDoub4 = cursorTest.GetGetter<Double>(testDV.Schema["doub4"]);
+                var delegateTestDoubFt1 = cursorTest.GetGetter<VBuffer<Double>>(testDV.Schema["doubFt1"]);
+                var delegateTestDoubFt2 = cursorTest.GetGetter<VBuffer<Double>>(testDV.Schema["doubFt2"]);
+
+                while (cursorBaseline.MoveNext())
+                {
+                    // Advance both cursors together; a shorter test view must fail instead of ending the loop quietly.
+                    Assert.True(cursorTest.MoveNext(), "Loaded data has fewer rows than the baseline");
+
+                    delegateBaselineId(ref baselineId);
+                    delegateBaselineDescription(ref baselineDescription);
+                    delegateBaselineDate(ref baselineDate);
+                    delegateBaselineSing1(ref baselineSing1);
+                    delegateBaselineSing2(ref baselineSing2);
+                    delegateBaselineSing3(ref baselineSing3);
+                    delegateBaselineSing4(ref baselineSing4);
+                    delegateBaselineDoub1(ref baselineDoub1);
+                    delegateBaselineDoub2(ref baselineDoub2);
+                    delegateBaselineDoub3(ref baselineDoub3);
+                    delegateBaselineDoub4(ref baselineDoub4);
+
+                    delegateTestId(ref testId);
+                    delegateTestDescription(ref testDescription);
+                    delegateTestDate(ref testDate);
+                    delegateTestSing1(ref testSing1);
+                    delegateTestSing2(ref testSing2);
+                    delegateTestSing3(ref testSing3);
+                    delegateTestSing4(ref testSing4);
+                    delegateTestSingFt1(ref testSingFt1);
+                    delegateTestSingFt2(ref testSingFt2);
+                    delegateTestDoub1(ref testDoub1);
+                    delegateTestDoub2(ref testDoub2);
+                    delegateTestDoub3(ref testDoub3);
+                    delegateTestDoub4(ref testDoub4);
+                    delegateTestDoubFt1(ref testDoubFt1);
+                    delegateTestDoubFt2(ref testDoubFt2);
+
+                    Assert.Equal(baselineId, testId);
+                    Assert.Equal(baselineDescription.ToString(), testDescription.ToString());
+                    Assert.Equal(baselineDate, testDate);
+                    Assert.Equal(baselineSing1, testSing1);
+                    Assert.Equal(baselineSing2, testSing2);
+                    Assert.Equal(baselineSing3, testSing3);
+                    Assert.Equal(baselineSing4, testSing4);
+                    Assert.Equal(baselineDoub1, testDoub1);
+                    Assert.Equal(baselineDoub2, testDoub2);
+                    Assert.Equal(baselineDoub3, testDoub3);
+                    Assert.Equal(baselineDoub4, testDoub4);
+
+                    var testSingFt1Arr = testSingFt1.DenseValues().ToArray();
+                    var testSingFt2Arr = testSingFt2.DenseValues().ToArray();
+                    Assert.Equal(baselineSing1, testSingFt1Arr[0]);
+                    Assert.Equal(baselineSing2, testSingFt1Arr[1]);
+                    Assert.Equal(baselineSing1, testSingFt2Arr[0]);
+                    Assert.Equal(baselineSing2, testSingFt2Arr[1]);
+                    Assert.Equal(baselineSing3, testSingFt2Arr[2]);
+                    Assert.Equal(baselineSing4, testSingFt2Arr[3]);
+
+                    var testDoubFt1Arr = testDoubFt1.DenseValues().ToArray();
+                    var testDoubFt2Arr = testDoubFt2.DenseValues().ToArray();
+                    Assert.Equal(baselineDoub1, testDoubFt1Arr[0]);
+                    Assert.Equal(baselineDoub2, testDoubFt1Arr[1]);
+                    Assert.Equal(baselineDoub1, testDoubFt2Arr[0]);
+                    Assert.Equal(baselineDoub2, testDoubFt2Arr[1]);
+                    Assert.Equal(baselineDoub3, testDoubFt2Arr[2]);
+                    Assert.Equal(baselineDoub4, testDoubFt2Arr[3]);
+                }
+
+                // The test view must be exhausted as well, otherwise it holds rows the baseline never checked.
+                Assert.False(cursorTest.MoveNext(), "Loaded data has more rows than the baseline");
+            }
+        }
 } } diff --git a/test/data/missing_fields.csv b/test/data/missing_fields.csv new file mode 100644 index 0000000000..1eb725b584 --- /dev/null +++ b/test/data/missing_fields.csv @@ -0,0 +1,15 @@ +int,description,num1,num2,date,num3,num4 +0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78 +,"this has an empty int and date", 1.1, 11.11,,111.111,1111.11111 +"","this has a quoted empty int and date", 1.1, 11.11,"",111.111,1111.11111 +"1","this has a quoted int and date", 1.1, 11.11,"1/1/2001",111.111,1111.11111 +2,"this has an empty num1 and a space in num3",,22.22,2/2/2002, ,2222.2222 +3,"this has an empty quoted num1 and a quoted space in num3","",33.33,3/3/2003," ",3333.3333 +4,"this has a space in num2 and a space in num4",4.4, ,4/4/2004,444.444, +5,"this has a quoted space num2 and quoted space in num4",5.5,"",5/5/2005,555.555," " +6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66 +7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777 +8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888, +9,"",9.9,99.99,9/9/2009,999.999,NaN +,,10.10,?,10/10/2010,101010.101010,? +11,NaN,?,NaN,11/11/2011,,Infinity \ No newline at end of file