Fix CI pipeline, add documentation for 'Encoder' (#31)

gyrdym · web-flow · commit 920b394b464e · 2022-04-22T00:06:00.000+03:00
diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
@@ -10,18 +10,19 @@ jobs:
   build:
     runs-on: ubuntu-latest
 
-    container:
-      image:  google/dart:beta
-
     steps:
       - uses: actions/checkout@v2
+      - uses: dart-lang/setup-dart@v1
 
       - name: Print Dart SDK version
         run: dart --version
 
       - name: Install dependencies
         run: dart pub get
 
+      - name: Verify formatting
+        run: dart format --output=none --set-exit-if-changed .
+
       - name: Analyze project source
         run: dart analyze --fatal-infos
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## 7.0.1
+- Added a code formatting check step to the CI pipeline
+- Corrected `README` examples
+- Added documentation to `Encoder` factory
+
 ## 7.0.0
 - `ml_datframe` 1.0.0 supported
 - `featureNames` parameter renamed to `columnNames`
diff --git a/README.md b/README.md
@@ -179,8 +179,8 @@ There is a convenient way to organize a sequence of data preprocessing operation
 
 ````dart
 final pipeline = Pipeline(dataFrame, [
-  encodeAsOneHotLabels(featureNames: ['Gender', 'Age', 'City_Category']),
-  encodeAsIntegerLabels(featureNames: ['Stay_In_Current_City_Years', 'Marital_Status']),
+  toOneHotLabels(columnNames: ['Gender', 'Age', 'City_Category']),
+  toIntegerLabels(columnNames: ['Stay_In_Current_City_Years', 'Marital_Status']),
   normalize(),
   standardize(),
 ]);
@@ -192,6 +192,6 @@ Once you create (or rather fit) a pipeline, you may use it further in your appli
 final processed = pipeline.process(dataFrame);
 ````
 
-`encodeAsOneHotLabels`, `encodeAsIntegerLabels`, `normalize` and `standardize` are pipeable operator functions. 
+`toOneHotLabels`, `toIntegerLabels`, `normalize` and `standardize` are pipeable operator functions. 
 The pipeable operator function is a factory that takes fitting data and creates a fitted pipeable entity (e.g., 
 `Normalizer` instance)  
diff --git a/lib/src/encoder/encoder.dart b/lib/src/encoder/encoder.dart
@@ -5,8 +5,47 @@ import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_facto
 import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart';
 import 'package:ml_preprocessing/src/pipeline/pipeable.dart';
 
-/// Categorical data encoder factory
+/// Categorical data encoder factory.
+///
+/// Algorithms that process data to create prediction models can't handle
+/// categorical data, since they are based on mathematical equations and work
+/// only with bare numbers. That means that the categorical data should be
+/// converted to numbers.
+///
+/// The factory exposes different ways to convert categorical data into numbers.
 abstract class Encoder implements Pipeable {
+  /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] takes
+  /// precedence over [columnNames]) from [fittingData], collects all unique
+  /// values from the columns and builds a map `raw value` => `encoded value`.
+  /// Once one calls the [process] method, the mapping will be applied.
+  ///
+  /// The mapping is built according to the following rules:
+  ///
+  /// Let's say, one has a list of values denoting a level of education:
+  ///
+  /// ```
+  /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD']
+  /// ```
+  ///
+  /// After applying the encoder, the source sequence will look
+  /// like this:
+  ///
+  /// ```
+  /// [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]
+  /// ```
+  ///
+  /// In other words, the `one-hot` encoder created the following mapping:
+  ///
+  /// `BSc` => [1, 0, 0]
+  ///
+  /// `PhD` => [0, 1, 0]
+  ///
+  /// `High School` => [0, 0, 1]
+  ///
+  /// Keep in mind that if you apply the [process] method to your data, the
+  /// number of columns will be increased since one categorical value in the
+  /// case of one-hot encoding requires several cells. Headers for the new
+  /// columns will be autogenerated from the categorical values.
   factory Encoder.oneHot(
     DataFrame fittingData, {
     Iterable<int>? columnIndices,
@@ -23,6 +62,33 @@ abstract class Encoder implements Pipeable {
         unknownValueHandlingType: unknownValueHandlingType,
       );
 
+  /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] takes
+  /// precedence over [columnNames]) from [fittingData], collects all unique
+  /// values from the columns and builds a map `raw value` => `encoded value`.
+  /// Once one calls the [process] method, the mapping will be applied.
+  ///
+  /// The mapping is built according to the following rules:
+  ///
+  /// Let's say, one has a list of values denoting a level of education:
+  ///
+  /// ```
+  /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD']
+  /// ```
+  ///
+  /// After applying the encoder, the source list will look
+  /// like this:
+  ///
+  /// ```
+  /// [0, 0, 1, 2, 1]
+  /// ```
+  ///
+  /// In other words, the `label` encoder created the following mapping:
+  ///
+  /// `BSc` => 0
+  ///
+  /// `PhD` => 1
+  ///
+  /// `High School` => 2
   factory Encoder.label(
     DataFrame fittingData, {
     Iterable<int>? columnIndices,
diff --git a/lib/src/encoder/encoder_type.dart b/lib/src/encoder/encoder_type.dart
@@ -1,6 +1,6 @@
 /// A type of categorical data encoding
 ///
-/// Algorithms that process data to create prediction models can't accept
+/// Algorithms that process data to create prediction models can't handle
 /// categorical data, since they are based on mathematical equations and work
 /// only with bare numbers. That means that the categorical data should be
 /// converted to numbers.
@@ -27,8 +27,8 @@
 ///
 /// `High School` => 2
 ///
-/// [EncoderType.oneHot] converts categorical values into binary sequences. Let's
-/// say, one has a list of values denoting a level of education:
+/// [EncoderType.oneHot] converts categorical values into binary sequences.
+/// Let's say, one has a list of values denoting a level of education:
 ///
 /// ```
 /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD']
@@ -38,16 +38,16 @@
 /// like this:
 ///
 /// ```
-/// [[0, 0, 1], [0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 0]]
+/// [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]
 /// ```
 ///
 /// In other words, the `one-hot` encoder created the following mapping:
 ///
-/// `BSc` => [0, 0, 1]
+/// `BSc` => [1, 0, 0]
 ///
 /// `PhD` => [0, 1, 0]
 ///
-/// `High School` => [1, 0, 0]
+/// `High School` => [0, 0, 1]
 enum EncoderType {
   oneHot,
   label,
diff --git a/lib/src/normalizer/normalizer_impl.dart b/lib/src/normalizer/normalizer_impl.dart
@@ -12,7 +12,7 @@ class NormalizerImpl implements Normalizer {
   @override
   DataFrame process(DataFrame input) {
     final transformed =
-    input.toMatrix(_dtype).mapRows((row) => row.normalize(_norm));
+        input.toMatrix(_dtype).mapRows((row) => row.normalize(_norm));
 
     return DataFrame.fromMatrix(transformed, header: input.header);
   }
diff --git a/lib/src/standardizer/standardizer_impl.dart b/lib/src/standardizer/standardizer_impl.dart
@@ -5,9 +5,9 @@ import 'package:ml_preprocessing/src/standardizer/standardizer.dart';
 
 class StandardizerImpl implements Standardizer {
   StandardizerImpl(
-      DataFrame fittingData, {
-        DType dtype = DType.float32,
-      })  : _dtype = dtype,
+    DataFrame fittingData, {
+    DType dtype = DType.float32,
+  })  : _dtype = dtype,
         _mean = fittingData.toMatrix(dtype).mean(),
         _deviation = Vector.fromList(
           // TODO: Consider SIMD-aware mapping
@@ -40,7 +40,7 @@ class StandardizerImpl implements Standardizer {
     }
 
     final processedMatrix =
-    inputAsMatrix.mapRows((row) => (row - _mean) / _deviation);
+        inputAsMatrix.mapRows((row) => (row - _mean) / _deviation);
     final discreteColumnNames = input.series
         .where((series) => series.isDiscrete)
         .map((series) => series.name);
diff --git a/pubspec.yaml b/pubspec.yaml
@@ -1,6 +1,6 @@
 name: ml_preprocessing
 description: Popular data preprocessing algorithms for machine learning
-version: 7.0.0
+version: 7.0.1
 homepage: https://github.com/gyrdym/ml_preprocessing
 
 environment:

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ class NormalizerImpl implements Normalizer {`
`12`	`12`	`@override`
`13`	`13`	`DataFrame process(DataFrame input) {`
`14`	`14`	`final transformed =`
`15`		`- input.toMatrix(_dtype).mapRows((row) => row.normalize(_norm));`
	`15`	`+ input.toMatrix(_dtype).mapRows((row) => row.normalize(_norm));`
`16`	`16`
`17`	`17`	`return DataFrame.fromMatrix(transformed, header: input.header);`
`18`	`18`	`}`