@@ -5,8 +5,47 @@ import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_facto
5
5
import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart' ;
6
6
import 'package:ml_preprocessing/src/pipeline/pipeable.dart' ;
7
7
8
- /// Categorical data encoder factory
8
+ /// Categorical data encoder factory.
9
+ ///
10
+ /// Algorithms that process data to create prediction models can't handle
11
+ /// categorical data, since they are based on mathematical equations and work
12
+ /// only with bare numbers. That means that the categorical data should be
13
+ /// converted to numbers.
14
+ ///
15
+ /// The factory exposes different ways to convert categorical data into numbers.
9
16
abstract class Encoder implements Pipeable {
17
+ /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] takes
18
+ /// precedence over [columnNames]) from [fittingData], collects all unique
19
+ /// values from the columns and builds a map `raw value` => `encoded value`.
20
+ /// Once one calls the [process] method, the mapping will be applied.
21
+ ///
22
+ /// The mapping is built according to the following rules:
23
+ ///
24
+ /// Let's say, one has a list of values denoting a level of education:
25
+ ///
26
+ /// ```
27
+ /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD']
28
+ /// ```
29
+ ///
30
+ /// After applying the encoder, the source sequence will look
31
+ /// like this:
32
+ ///
33
+ /// ```
34
+ /// [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]
35
+ /// ```
36
+ ///
37
+ /// In other words, the `one-hot` encoder created the following mapping:
38
+ ///
39
+ /// `BSc` => [1, 0, 0]
40
+ ///
41
+ /// `PhD` => [0, 1, 0]
42
+ ///
43
+ /// `High School` => [0, 0, 1]
44
+ ///
45
+ /// Keep in mind that if you apply the [process] method to your data, the
46
+ /// number of columns will increase, since with one-hot encoding a single
47
+ /// categorical value occupies several cells. Headers for the new
48
+ /// columns will be autogenerated from the categorical values.
10
49
factory Encoder .oneHot (
11
50
DataFrame fittingData, {
12
51
Iterable <int >? columnIndices,
@@ -23,6 +62,33 @@ abstract class Encoder implements Pipeable {
23
62
unknownValueHandlingType: unknownValueHandlingType,
24
63
);
25
64
65
+ /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] takes
66
+ /// precedence over [columnNames]) from [fittingData], collects all unique
67
+ /// values from the columns and builds a map `raw value` => `encoded value`.
68
+ /// Once one calls the [process] method, the mapping will be applied.
69
+ ///
70
+ /// The mapping is built according to the following rules:
71
+ ///
72
+ /// Let's say, one has a list of values denoting a level of education:
73
+ ///
74
+ /// ```
75
+ /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD']
76
+ /// ```
77
+ ///
78
+ /// After applying the encoder, the source list will look
79
+ /// like this:
80
+ ///
81
+ /// ```
82
+ /// [0, 0, 1, 2, 1]
83
+ /// ```
84
+ ///
85
+ /// In other words, the `label` encoder created the following mapping:
86
+ ///
87
+ /// `BSc` => 0
88
+ ///
89
+ /// `PhD` => 1
90
+ ///
91
+ /// `High School` => 2
26
92
factory Encoder .label (
27
93
DataFrame fittingData, {
28
94
Iterable <int >? columnIndices,
0 commit comments