Skip to content

Commit 05bfd03

Browse files
Merge branch 'main' into 6806_dataframe_append_issues
2 parents ec0a734 + d9dbf99 commit 05bfd03

File tree

8 files changed

+457
-404
lines changed

8 files changed

+457
-404
lines changed

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,18 +98,21 @@ private static Type MaxKind(Type a, Type b)
9898
/// <param name="guessRows">number of rows used to guess types</param>
9999
/// <param name="addIndexColumn">add one column with the row index</param>
100100
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
101+
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
102+
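/// <param name="cultureInfo">Culture info used when converting the loaded values to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>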
101103
/// <returns>DataFrame</returns>
102104
public static DataFrame LoadCsv(string filename,
103105
char separator = ',', bool header = true,
104106
string[] columnNames = null, Type[] dataTypes = null,
105107
int numRows = -1, int guessRows = 10,
106-
bool addIndexColumn = false, Encoding encoding = null)
108+
bool addIndexColumn = false, Encoding encoding = null,
109+
bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
107110
{
108111
using (Stream fileStream = new FileStream(filename, FileMode.Open))
109112
{
110113
return LoadCsv(fileStream,
111114
separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows,
112-
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding);
115+
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo);
113116
}
114117
}
115118

@@ -351,8 +354,14 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
351354
char separator = ',', bool header = true,
352355
string[] columnNames = null, Type[] dataTypes = null,
353356
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
354-
bool renameDuplicatedColumns = false)
357+
bool renameDuplicatedColumns = false,
358+
CultureInfo cultureInfo = null)
355359
{
360+
if (cultureInfo == null)
361+
{
362+
cultureInfo = CultureInfo.CurrentCulture;
363+
}
364+
356365
if (dataTypes == null && guessRows <= 0)
357366
{
358367
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
@@ -452,7 +461,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
452461
}
453462
else
454463
{
455-
ret.Append(fields, inPlace: true);
464+
ret.Append(fields, inPlace: true, cultureInfo: cultureInfo);
456465
}
457466
++rowline;
458467
}
@@ -508,7 +517,6 @@ public TextReader GetTextReader()
508517
}
509518

510519
}
511-
512520
}
513521

514522
/// <summary>
@@ -522,14 +530,18 @@ public TextReader GetTextReader()
522530
/// <param name="numberOfRowsToRead">number of rows to read, not including the header (if present)</param>
523531
/// <param name="guessRows">number of rows used to guess types</param>
524532
/// <param name="addIndexColumn">add one column with the row index</param>
533+
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
534+
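/// <param name="cultureInfo">Culture info used when converting the loaded values to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>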
525535
/// <returns><see cref="DataFrame"/></returns>
526536
public static DataFrame LoadCsvFromString(string csvString,
527537
char separator = ',', bool header = true,
528538
string[] columnNames = null, Type[] dataTypes = null,
529-
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
539+
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
540+
bool renameDuplicatedColumns = false,
541+
CultureInfo cultureInfo = null)
530542
{
531543
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString);
532-
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
544+
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
533545
}
534546

535547
/// <summary>
@@ -545,12 +557,13 @@ public static DataFrame LoadCsvFromString(string csvString,
545557
/// <param name="addIndexColumn">add one column with the row index</param>
546558
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
547559
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
560+
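/// <param name="cultureInfo">Culture info used when converting the loaded values to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>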
548561
/// <returns><see cref="DataFrame"/></returns>
549562
public static DataFrame LoadCsv(Stream csvStream,
550563
char separator = ',', bool header = true,
551564
string[] columnNames = null, Type[] dataTypes = null,
552565
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
553-
Encoding encoding = null, bool renameDuplicatedColumns = false)
566+
Encoding encoding = null, bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
554567
{
555568
if (!csvStream.CanSeek)
556569
{
@@ -563,7 +576,7 @@ public static DataFrame LoadCsv(Stream csvStream,
563576
}
564577

565578
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
566-
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns);
579+
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
567580
}
568581

569582
/// <summary>

src/Microsoft.Data.Analysis/DataFrame.cs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System;
66
using System.Collections.Generic;
77
using System.Diagnostics;
8+
using System.Globalization;
89
using System.Linq;
910
using System.Text;
1011

@@ -485,12 +486,13 @@ private void ResizeByOneAndAppend(DataFrameColumn column, object value)
485486
/// <remarks> Values are appended based on the column names</remarks>
486487
/// <param name="rows">The rows to be appended to this DataFrame </param>
487488
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
488-
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
489+
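/// <param name="cultureInfo">Culture info used when converting the appended values to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>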
490+
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false, CultureInfo cultureInfo = null)
489491
{
490492
DataFrame ret = inPlace ? this : Clone();
491493
foreach (DataFrameRow row in rows)
492494
{
493-
ret.Append(row.GetValues(), inPlace: true);
495+
ret.Append(row.GetValues(), inPlace: true, cultureInfo: cultureInfo);
494496
}
495497
return ret;
496498
}
@@ -502,8 +504,14 @@ public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
502504
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
503505
/// <param name="row"></param>
504506
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
505-
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
507+
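/// <param name="cultureInfo">Culture info used when converting each value to its column's data type. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>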
508+
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false, CultureInfo cultureInfo = null)
506509
{
510+
if (cultureInfo == null)
511+
{
512+
cultureInfo = CultureInfo.CurrentCulture;
513+
}
514+
507515
DataFrame ret = inPlace ? this : Clone();
508516
IEnumerator<DataFrameColumn> columnEnumerator = ret.Columns.GetEnumerator();
509517
bool columnMoveNext = columnEnumerator.MoveNext();
@@ -531,7 +539,7 @@ public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
531539
}
532540
if (value != null)
533541
{
534-
value = Convert.ChangeType(value, column.DataType);
542+
value = Convert.ChangeType(value, column.DataType, cultureInfo);
535543

536544
if (value is null)
537545
{

src/Microsoft.ML.Core/Prediction/IPredictor.cs

Lines changed: 49 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2,62 +2,61 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
namespace Microsoft.ML
5+
namespace Microsoft.ML;
6+
7+
/// <summary>
8+
/// Type of prediction task. Note that this is a legacy structure and usage of this should generally be
9+
/// discouraged in future projects. Its presence suggests that there are privileged and supported
10+
/// tasks, and anything outside of this is unsupported. This runs rather contrary to the idea of this
11+
/// being an expandable framework, and it is inappropriately limiting. For legacy pipelines based on
12+
/// <see cref="ITrainer"/> and <see cref="IPredictor"/> it is still useful, but for things based on
13+
/// the <see cref="IEstimator{TTransformer}"/> idiom, it is inappropriate.
14+
/// </summary>
15+
[BestFriend]
16+
internal enum PredictionKind
617
{
7-
/// <summary>
8-
/// Type of prediction task. Note that this is a legacy structure and usage of this should generally be
9-
/// discouraged in future projects. Its presence suggests that there are privileged and supported
10-
/// tasks, and anything outside of this is unsupported. This runs rather contrary to the idea of this
11-
/// being an expandable framework, and it is inappropriately limiting. For legacy pipelines based on
12-
/// <see cref="ITrainer"/> and <see cref="IPredictor"/> it is still useful, but for things based on
13-
/// the <see cref="IEstimator{TTransformer}"/> idiom, it is inappropriate.
14-
/// </summary>
15-
[BestFriend]
16-
internal enum PredictionKind
17-
{
18-
Unknown = 0,
19-
Custom = 1,
18+
Unknown = 0,
19+
Custom = 1,
2020

21-
BinaryClassification = 2,
22-
MulticlassClassification = 3,
23-
Regression = 4,
24-
MultiOutputRegression = 5,
25-
Ranking = 6,
26-
Recommendation = 7,
27-
AnomalyDetection = 8,
28-
Clustering = 9,
29-
SequenceClassification = 10,
21+
BinaryClassification = 2,
22+
MulticlassClassification = 3,
23+
Regression = 4,
24+
MultiOutputRegression = 5,
25+
Ranking = 6,
26+
Recommendation = 7,
27+
AnomalyDetection = 8,
28+
Clustering = 9,
29+
SequenceClassification = 10,
3030

31-
// More to be added later.
32-
}
31+
// More to be added later.
32+
}
3333

34+
/// <summary>
35+
/// Weakly typed version of IPredictor.
36+
/// </summary>
37+
[BestFriend]
38+
internal interface IPredictor
39+
{
3440
/// <summary>
35-
/// Weakly typed version of IPredictor.
41+
/// Return the type of prediction task.
3642
/// </summary>
37-
[BestFriend]
38-
internal interface IPredictor
39-
{
40-
/// <summary>
41-
/// Return the type of prediction task.
42-
/// </summary>
43-
PredictionKind PredictionKind { get; }
44-
}
43+
PredictionKind PredictionKind { get; }
44+
}
4545

46-
/// <summary>
47-
/// A predictor the produces values of the indicated type.
48-
/// REVIEW: Determine whether this is just a temporary shim or long term solution.
49-
/// </summary>
50-
[BestFriend]
51-
internal interface IPredictorProducing<out TResult> : IPredictor
52-
{
53-
}
46+
/// <summary>
47+
/// A predictor that produces values of the indicated type.
48+
/// REVIEW: Determine whether this is just a temporary shim or long term solution.
49+
/// </summary>
50+
[BestFriend]
51+
internal interface IPredictorProducing<out TResult> : IPredictor
52+
{
53+
}
5454

55-
/// <summary>
56-
/// A predictor that produces values and distributions of the indicated types.
57-
/// Note that from a public API perspective this is bad.
58-
/// </summary>
59-
[BestFriend]
60-
internal interface IDistPredictorProducing<out TResult, out TResultDistribution> : IPredictorProducing<TResult>
61-
{
62-
}
55+
/// <summary>
56+
/// A predictor that produces values and distributions of the indicated types.
57+
/// Note that from a public API perspective this is bad.
58+
/// </summary>
59+
[BestFriend]
60+
internal interface IDistPredictorProducing<out TResult, out TResultDistribution> : IPredictorProducing<TResult>
61+
{
6362
}

0 commit comments

Comments
 (0)