diff --git a/docs/code/IDataViewTypeSystem.md b/docs/code/IDataViewTypeSystem.md index 76e32c22ca..cabf3dd54e 100644 --- a/docs/code/IDataViewTypeSystem.md +++ b/docs/code/IDataViewTypeSystem.md @@ -288,7 +288,7 @@ true/false values. The `BooleanDataViewType` class derives from The default value of `BL` is `false`, and it has no `NA` value. -There is a standard conversion from `TX` to `BL`. There are standard +There are standard conversions from `TX` to `BL`, and from `BL` to `TX`. There are standard conversions from `BL` to all signed integer and floating point numeric types, with `false` mapping to zero and `true` mapping to one. @@ -332,7 +332,8 @@ values being the canonical `NA` values. There are standard conversions from each floating-point type to the other floating-point type. There are also standard conversions from text to each -floating-point type and from each integer type to each floating-point type. +floating-point type, from each floating-point type to text, and from each +integer type to each floating-point type. ### Signed Integer Types @@ -342,8 +343,8 @@ default value of each of these is zero. There are standard conversions from each signed integer type to every other signed integer type. There are also standard conversions from text to each -signed integer type and from each signed integer type to each floating-point -type. +signed integer type, from each signed integer type to text, and from each +signed integer type to each floating-point type. Note that we have not defined standard conversions from floating-point types to signed integer types. @@ -357,8 +358,8 @@ have an `NA` value. There are standard conversions from each unsigned integer type to every other unsigned integer type. There are also standard conversions from text to each -unsigned integer type and from each unsigned integer type to each floating- -point type. 
+unsigned integer type, from each unsigned integer type to text, and from each unsigned +integer type to each floating-point type. Note that we have not defined standard conversions from floating-point types to unsigned integer types, or between signed integer types and unsigned @@ -541,6 +542,13 @@ case, it is simple to map implicit items (suppressed due to sparsity) to zero. In the former case, these items are first mapped to the empty text value. To get the same result, we need empty text to map to zero. +### To Text + +There are standard conversions to `TX` from the standard primitive types, +`R4`, `R8`, `I1`, `I2`, `I4`, `I8`, `U1`, `U2`, `U4`, `U8`, `BL`, `TS`, `DT`, and `DZ`. +`R4` uses the G7 format and `R8` uses the G17 format. `BL` converts to "True" or "False". +`TS` uses the "c" (constant) format. `DT` and `DZ` use the "o" (round-trip) format. + ### Floating Point There are standard conversions from `R4` to `R8` and from `R8` to `R4`. These diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 82de43bb48..ead72b3adb 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -114,6 +114,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -123,6 +124,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -132,6 +134,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -141,6 +144,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -151,6 +155,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -161,6 +166,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + 
AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -171,6 +177,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -181,6 +188,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -188,16 +196,19 @@ private Conversions() AddStd(Convert); // REVIEW: Conversion from UG to R4/R8, should we? AddAux(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); AddStd(Convert); AddAux(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); AddStd(Convert); AddAux(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -225,22 +236,26 @@ private Conversions() AddStd(Convert); AddStd(Convert); AddAux(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); AddStd(Convert); AddAux(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); AddStd(Convert); AddStd(Convert); AddAux(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); AddStd(Convert); AddAux(Convert); + AddStd(Convert); AddIsNA(IsNA); AddIsNA(IsNA); @@ -912,6 +927,26 @@ public void Convert(in BL src, ref SB dst) public void Convert(in DZ src, ref SB dst) { ClearDst(ref dst); dst.AppendFormat("{0:o}", src); } #endregion ToStringBuilder + #region ToTX + // Conversions to text (TX). Integer and floating-point values are formatted with the invariant culture so the result does not depend on the current thread culture. + // Every overload must assign dst; TS uses the "c" format and DT/DZ use the round-trip "o" format. + public void Convert(in I1 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in I2 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in I4 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in I8 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in U1 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in U2 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in U4 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in U8 src, ref TX dst) => dst = src.ToString(CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in 
UG src, ref TX dst) => dst = string.Format("0x{0:x16}{1:x16}", src.High, src.Low).AsMemory(); + public void Convert(in R4 src, ref TX dst) => dst = src.ToString("G7", CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in R8 src, ref TX dst) => dst = src.ToString("G17", CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in BL src, ref TX dst) => dst = src.ToString().AsMemory(); + public void Convert(in TS src, ref TX dst) => dst = string.Format("{0:c}", src).AsMemory(); + public void Convert(in DT src, ref TX dst) => dst = string.Format("{0:o}", src).AsMemory(); + public void Convert(in DZ src, ref TX dst) => dst = string.Format("{0:o}", src).AsMemory(); + #endregion ToTX + #region ToBL public void Convert(in R8 src, ref BL dst) => dst = System.Convert.ToBoolean(src); public void Convert(in R4 src, ref BL dst) => dst = System.Convert.ToBoolean(src); diff --git a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs index 7ff7abc797..6ef9214866 100644 --- a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs @@ -248,6 +248,26 @@ public void TestConvertWorkout() var expectedConvertedValues = ML.Data.LoadFromEnumerable(allTypesDataConverted); CheckSameValues(expectedConvertedValues, actualConvertedValues); + + var allInputTypesData = new[] { new { A = (sbyte)sbyte.MinValue, B = (byte)byte.MinValue, C = double.MaxValue, D = float.MinValue, E = "already a string", F = false } }; + var allInputTypesDataView = ML.Data.LoadFromEnumerable(allInputTypesData); + var allInputTypesDataPipe = ML.Transforms.Conversion.ConvertType(columns: new[] {new TypeConvertingEstimator.ColumnOptions("A1", DataKind.String, "A"), + new TypeConvertingEstimator.ColumnOptions("B1", DataKind.String, "B"), + new TypeConvertingEstimator.ColumnOptions("C1", DataKind.String, "C"), + new TypeConvertingEstimator.ColumnOptions("D1", DataKind.String, "D"), + new 
TypeConvertingEstimator.ColumnOptions("E1", DataKind.String, "E"), + new TypeConvertingEstimator.ColumnOptions("F1", DataKind.String, "F"), + }); + + var convertedValues = allInputTypesDataPipe.Fit(allInputTypesDataView).Transform(allInputTypesDataView); + + var expectedValuesData = new[] { new { A = (sbyte)sbyte.MinValue, B = (byte)byte.MinValue, C = double.MaxValue, D = float.MinValue, E = "already a string", F = false, + A1 = "-128", B1 = "0", C1 = "1.7976931348623157E+308", D1 = "-3.402823E+38", E1 = "already a string", F1 = "False" } }; + var expectedValuesDataView = ML.Data.LoadFromEnumerable(expectedValuesData); + + CheckSameValues(expectedValuesDataView, convertedValues); + TestEstimatorCore(allInputTypesDataPipe, allInputTypesDataView); + Done(); }