diff --git a/ZBaselines/Common/EntryPoints/core_ep-list.tsv b/ZBaselines/Common/EntryPoints/core_ep-list.tsv index 47007edaa6..22c2767d7a 100644 --- a/ZBaselines/Common/EntryPoints/core_ep-list.tsv +++ b/ZBaselines/Common/EntryPoints/core_ep-list.tsv @@ -1,7 +1,8 @@ +Data.CustomTextLoader Import a dataset from a text file Microsoft.ML.Runtime.EntryPoints.ImportTextData ImportText Microsoft.ML.Runtime.EntryPoints.ImportTextData+Input Microsoft.ML.Runtime.EntryPoints.ImportTextData+Output Data.DataViewReference Pass dataview from memory to experiment Microsoft.ML.Runtime.EntryPoints.DataViewReference ImportData Microsoft.ML.Runtime.EntryPoints.DataViewReference+Input Microsoft.ML.Runtime.EntryPoints.DataViewReference+Output Data.IDataViewArrayConverter Create and array variable Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro MakeArray Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIDataViewInput Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIDataViewOutput Data.PredictorModelArrayConverter Create and array variable Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro MakeArray Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIPredictorModelInput Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayIPredictorModelOutput -Data.TextLoader Import a dataset from a text file Microsoft.ML.Runtime.EntryPoints.ImportTextData ImportText Microsoft.ML.Runtime.EntryPoints.ImportTextData+Input Microsoft.ML.Runtime.EntryPoints.ImportTextData+Output +Data.TextLoader Import a dataset from a text file Microsoft.ML.Runtime.EntryPoints.ImportTextData TextLoader Microsoft.ML.Runtime.EntryPoints.ImportTextData+LoaderInput Microsoft.ML.Runtime.EntryPoints.ImportTextData+Output Models.AnomalyDetectionEvaluator Evaluates an anomaly detection scored dataset. Microsoft.ML.Runtime.Data.Evaluate AnomalyDetection Microsoft.ML.Runtime.Data.AnomalyDetectionMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CommonEvaluateOutput Models.BinaryClassificationEvaluator Evaluates a binary classification scored dataset. 
Microsoft.ML.Runtime.Data.Evaluate Binary Microsoft.ML.Runtime.Data.BinaryClassifierMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+ClassificationEvaluateOutput Models.BinaryCrossValidator Cross validation for binary classification Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro CrossValidateBinary Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+Output] diff --git a/ZBaselines/Common/EntryPoints/core_manifest.json b/ZBaselines/Common/EntryPoints/core_manifest.json index a6309fe36a..6eeb1bf709 100644 --- a/ZBaselines/Common/EntryPoints/core_manifest.json +++ b/ZBaselines/Common/EntryPoints/core_manifest.json @@ -1,5 +1,43 @@ { "EntryPoints": [ + { + "Name": "Data.CustomTextLoader", + "Desc": "Import a dataset from a text file", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "InputFile", + "Type": "FileHandle", + "Desc": "Location of the input file", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "CustomSchema", + "Type": "String", + "Desc": "Custom schema to use for parsing", + "Aliases": [ + "schema" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "The resulting data view" + } + ] + }, { "Name": "Data.DataViewReference", "Desc": "Pass dataview from memory to experiment", @@ -99,16 +137,325 @@ "IsNullable": false }, { - "Name": "CustomSchema", - "Type": "String", - "Desc": "Custom schema to use for parsing", + "Name": "Arguments", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the column", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Type", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "Type of the items in the column", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Source", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Min", + "Type": "Int", + "Desc": "First index in the range", + "Required": true, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Max", + "Type": "Int", + "Desc": "Last index in the range", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AutoEnd", + "Type": "Bool", + "Desc": "This range extends to the end of the line, but should be a fixed number of items", + "Aliases": [ + "auto" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "VariableEnd", + "Type": "Bool", + "Desc": "This range extends to the end of the line, which can vary from line to line", + "Aliases": [ + "var" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "AllOther", + "Type": "Bool", + "Desc": "This range includes only other indices not specified", + "Aliases": [ + "other" + 
], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "ForceVector", + "Type": "Bool", + "Desc": "Force scalar columns to be treated as vectors of length one", + "Aliases": [ + "vector" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + } + ] + } + }, + "Desc": "Source index range(s) of the column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "KeyRange", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Min", + "Type": "UInt", + "Desc": "First index in the range", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Max", + "Type": "UInt", + "Desc": "Last index in the range", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Contiguous", + "Type": "Bool", + "Desc": "Whether the key is contiguous", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + } + ] + }, + "Desc": "For a key column, this defines the range of values", + "Aliases": [ + "key" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "Column groups. Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Use separate parsing threads?", + "Aliases": [ + "threads" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "HeaderFile", + "Type": "String", + "Desc": "File containing a header with feature names. If specified, header defined in the data file (header+) is ignored.", + "Aliases": [ + "hf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "MaxRows", + "Type": "Int", + "Desc": "Maximum number of rows to produce", + "Aliases": [ + "rows" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AllowQuoting", + "Type": "Bool", + "Desc": "Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.", + "Aliases": [ + "quote" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "AllowSparse", + "Type": "Bool", + "Desc": "Whether the input may include sparse representations", + "Aliases": [ + "sparse" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "InputSize", + "Type": "Int", + "Desc": "Number of source columns in the text data. 
Default is that sparse rows contain their size information.", + "Aliases": [ + "size" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Separator", + "Type": { + "Kind": "Array", + "ItemType": "Char" + }, + "Desc": "Source column separator.", + "Aliases": [ + "sep" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": [ + "\t" + ] + }, + { + "Name": "TrimWhitespace", + "Type": "Bool", + "Desc": "Remove trailing whitespace from lines", + "Aliases": [ + "trim" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "HasHeader", + "Type": "Bool", + "Desc": "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.", + "Aliases": [ + "header" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + } + ] + }, + "Desc": "Arguments", "Aliases": [ - "schema" + "args" ], - "Required": false, + "Required": true, "SortOrder": 2.0, - "IsNullable": false, - "Default": null + "IsNullable": false } ], "Outputs": [ @@ -117,6 +464,9 @@ "Type": "DataView", "Desc": "The resulting data view" } + ], + "InputKind": [ + "ILearningPipelineLoader" ] }, { @@ -21959,6 +22309,10 @@ } ] }, + { + "Kind": "ILearningPipelineLoader", + "Settings": [] + }, { "Kind": "IMulticlassClassificationOutput", "Settings": [] diff --git a/build.cmd b/build.cmd index 872487cb88..c020999ade 100644 --- a/build.cmd +++ b/build.cmd @@ -1,2 +1,2 @@ -@call %~dp0run.cmd build %* +@call "%~dp0run.cmd" build %* @exit /b %ERRORLEVEL% diff --git a/docs/code/IDataViewDesignPrinciples.md b/docs/code/IDataViewDesignPrinciples.md new file mode 100644 index 0000000000..c3f345bf68 --- /dev/null +++ b/docs/code/IDataViewDesignPrinciples.md @@ -0,0 +1,471 @@ +# IDataView Design Principles + +## Overview + +### Brief Introduction to IDataView + +The *IDataView system* is a set of interfaces and components that provide +efficient, compositional processing of schematized data for machine learning +and advanced analytics applications. It is designed to gracefully and +efficiently handle high dimensional data and large data sets. It does not +directly address distributed data and computation, but is suitable for single +node processing of data partitions belonging to larger distributed data sets. + +IDataView is the data pipeline machinery for ML.NET. Microsoft teams consuming +this library have implemented libraries of IDataView related components +(loaders, transforms, savers, trainers, predictors, etc.) and have validated +the performance, scalability and task flexibility benefits. + +The name IDataView was inspired from the database world, where the term table +typically indicates a mutable body of data, while a view is the result of a +query on one or more tables or views, and is generally immutable. Note that +both tables and views are schematized, being organized into typed columns and +rows conforming to the column types. Views differ from tables in several ways: + +* Views are *composable*. New views are formed by applying transformations + (queries) to other views. In contrast, forming a new table from an existing + table involves copying data, making the tables decoupled; the new table is + not linked to the original table in any way. + +* Views are *virtual*; tables are fully realized/persisted. 
In other words, a + table contains the values in the rows while a view computes values from + other views or tables, so does not contain or own the values. + +* Views are *immutable*; tables are mutable. Since a view does not contain + values, but merely computes values from its source views, there is no + mechanism for modifying the values. + +Note that immutability and compositionality are critical enablers of +technologies that require reasoning over transformation, like query +optimization and remoting. Immutability is also key for concurrency and thread +safety. Views being virtual minimizes I/O, memory allocation, and computation. +Information is accessed, memory is allocated, and computation is performed, +only when needed to satisfy a local request for information. + +### Design Requirements + +The IDataView design fulfills the following design requirements: + +* **General schema**: Each view carries schema information, which specifies + the names and types of the view's columns, together with metadata associated + with the columns. The system is optimized for a reasonably small number of + columns (hundreds). See [here](#basics). + +* **Open type system**: The column type system is open, in the sense that new + data types can be introduced at any time and in any assembly. There is a set + of standard types (which may grow over time), but there is no registry of + all supported types. See [here](#basics). + +* **High dimensional data support**: The type system for columns includes + homogeneous vector types, so a set of related primitive values can be + grouped into a single vector-valued column. See [here](#vector-types). + +* **Compositional**: The IDataView design supports components of various + kinds, and supports composing multiple primitive components to achieve + higher-level semantics. See [here](#components). + +* **Open component system**: While the ML.NET code has a growing large library + of IDataView components, additional components that interoperate with these + may be implemented in other code bases. See [here](#components). + +* **Cursoring**: The rows of a view are accessed sequentially via a row + cursor. Multiple cursors can be active on the same view, both sequentially + and in parallel. In particular, views support multiple iterations through + the rows. Each cursor has a set of active columns, specified at cursor + construction time. Shuffling is supported via an optional random number + generator passed at cursor construction time. See [here](#cursoring). + +* **Lazy computation**: When only a subset of columns or a subset of rows is + requested, computation for other columns and rows can be, and generally is, + avoided. Certain transforms, loaders, and caching scenarios may be + speculative or eager in their computation, but the default is to perform + only computation needed for the requested columns and rows. See + [here](#lazy-computation-and-active-columns). + +* **Immutability and repeatability**: The data served by a view is immutable + and any computations performed are repeatable. In particular, multiple + cursors on the view produce the same row values in the same order (when + using the same shuffling). See [here](#immutability-and-repeatability). + +* **Memory efficiency**: The IDataView design includes cooperative buffer + sharing patterns that eliminate the need to allocate objects or buffers for + each row when cursoring through a view. See [here](#memory-efficiency). 
+ +* **Batch-parallel computation**: The IDataView system includes the ability to + get a set of cursors that can be executed in parallel, with each individual + cursor serving up a subset of the rows. Splitting into multiple cursors can + be done either at the loader level or at an arbitrary point in a pipeline. + The component that performs splitting also provides the consolidation logic. + This enables computation heavy pipelines to leverage multiple cores without + complicating each individual transform implementation. See + [here](#batch-parallel-cursoring). + +* **Large data support**: Constructing views on data files and cursoring + through the rows of a view does not require the entire data to fit in + memory. Conversely, when the entire data fits, there is nothing preventing + it from being loaded entirely in memory. See [here](#data-size). + +### Design Non-requirements + +The IDataView system design does *not* include the following: + +* **Multi-view schema information**: There is no direct support for specifying + cross-view schema information, for example, that certain columns are primary + keys, and that there are foreign key relationships among tables. However, + the column metadata support, together with conventions, may be used to + represent such information. + +* **Standard ML schema**: The IDataView system does not define, nor prescribe, + standard ML schema representation. For example, it does not dictate + representation of nor distinction between different semantic interpretations + of columns, such as label, feature, score, weight, etc. However, the column + metadata support, together with conventions, may be used to represent such + interpretations. + +* **Row count**: A view is not required to provide its row count. The + `IDataView` interface has a `GetRowCount` method with type `Nullable<long>`. + When this returns `null`, the row count is not available directly from the + view. + +* **Efficient indexed row access**: There is no standard way in the IDataView + system to request the values for a specific row number. While the + `IRowCursor` interface has a `MoveMany(long count)` method, it only supports + moving forward `(count > 0)`, and is not necessarily more efficient than + calling `MoveNext()` repeatedly. See [here](#row-cursor). + +* **Data file formats**: The IDataView system does not dictate storage or + transport formats. It *does* include interfaces for loader and saver + components. The ML.NET code has implementations of loaders and savers for + some binary and text file formats. + +* **Multi-node computation over multiple data partitions**: The IDataView + design is focused on single node computation. We expect that in multi-node + applications, each node will be given its own data partition(s) to operate + on, with aggregation happening outside an IDataView pipeline. + +## Schema and Type System + +### Basics + +IDataView has general schema support, in that a view can have an arbitrary +number of columns, each having an associated name, index, data type, and +optional metadata. + +Column names are case sensitive. Multiple columns can share the same name, in +which case, one of the columns hides the others, in the sense that the name +will map to one of the column indices, the visible one. All user interaction +with columns should be via name, not index, so the hidden columns are +generally invisible to the user. However, hidden columns are often useful for +diagnostic purposes.
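+As a side illustration, here is a toy sketch of how name lookup can behave
+under hiding. This is hypothetical code, not the actual ML.NET `ISchema`
+implementation; which duplicate is visible is merely a convention (ML.NET's
+convention, discussed in the implementation notes, picks the largest index):
+
+```csharp
+// Toy model of column name hiding: two columns share the name "Features".
+// Lookup by name resolves to exactly one visible column.
+string[] columnNames = { "Label", "Features", "Features" };
+
+int LookupColumn(string name)
+{
+    for (int col = columnNames.Length - 1; col >= 0; col--)
+    {
+        if (columnNames[col] == name)
+            return col; // the visible column; earlier duplicates are hidden
+    }
+    return -1; // no column with this name
+}
+
+// LookupColumn("Features") == 2; column 1 remains present (and reachable by
+// index) for diagnostic tools, but ordinary name-based access never sees it.
+```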
+ +The set of supported column data types forms an open type system, in the sense +that additional types can be added at any time and in any assembly. However, +there is a precisely defined set of standard types including: + +* Text +* Boolean +* Single and Double precision floating point +* Signed integer values using 1, 2, 4, or 8 bytes +* Unsigned integer values using 1, 2, 4, or 8 bytes +* Unsigned 16 byte values for ids and probabilistically unique hashes +* Date time, date time zone, and timespan +* Key types +* Vector types + +The set of standard types will likely be expanded over time. + +The IDataView type system is specified in a separate document, *IDataView Type +System Specification*. + +IDataView provides a general mechanism for associating semantic metadata with +columns, such as designating sets of score columns, names associated with the +individual slots of a vector-valued column, values associated with a key type +column, whether a column's data is normalized, etc. + +While IDataView schema supports an arbitrary number of columns, it, like most +schematized data systems, is designed for a modest number of columns, +typically, limited to a few hundred. When a large number of *features* are +required, the features should be gathered into one or more vector-valued +columns, as discussed in the next section. This is important for both user +experience and performance. + +### Vector Types + +Machine learning and advanced analytics applications often involve +high-dimensional data. For example, a common technique for learning from text, +known as [bag-of-words](https://en.wikipedia.org/wiki/Bag-of-words_model), +represents each word in the text as a numeric feature containing the number of +occurrences of that word. Another technique is indicator or one-hot encoding +of categorical values, where, for example, a text-valued column containing a +person's last name is expanded to a set of features, one for each possible +name (Tesla, Lincoln, Gandhi, Zhang, etc.), with a value of one for the +feature corresponding to the name, and the remaining features having value +zero. Variations of these techniques use hashing in place of dictionary +lookup. With hashing, it is common to use 20 bits or more for the hash value, +producing `2^20` (about a million) features or more. + +These techniques typically generate an enormous number of features. +Representing each feature as an individual column is far from ideal, both from +the perspective of how the user interacts with the information and how the +information is managed in the schematized system. The solution is to represent +each set of features, whether indicator values, or bag-of-words counts, as a +single vector-valued column. + +A vector type specifies an item type and optional dimensionality information. +The item type must be a primitive, non-vector, type. The optional +dimensionality information specifies, at the basic level, the number of items +in the corresponding vector values. + +When the size is unspecified, the vector type is variable-length, and +corresponding vector values may have any length. A tokenization transform, +which maps a text value to the sequence of individual terms in that text, +naturally produces variable-length vectors of text. Then, a hashing ngram +transform may map the variable-length vectors of text to a bag-of-ngrams +representation, which naturally produces numeric vectors of length `2^k`, +where `k` is the number of bits used in the hash function.
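+As an illustrative sketch (plain C#, not an ML.NET transform), hashed
+bag-of-words maps a variable-length sequence of tokens to a fixed-length
+count vector of length `2^k`; the hash function here is just a stand-in:
+
+```csharp
+// Hashed bag-of-words: tokens in, fixed-length count vector out.
+// With bits = 20 this produces 2^20 (about a million) slots, which is why
+// such features belong in a single vector-valued column rather than in a
+// million individual columns.
+static float[] HashedBagOfWords(string[] tokens, int bits)
+{
+    var counts = new float[1 << bits]; // 2^bits slots
+    foreach (string token in tokens)
+    {
+        // Any deterministic hash works for illustration; ML.NET uses its
+        // own hashing, so GetHashCode is merely a convenient placeholder.
+        int slot = (token.GetHashCode() & int.MaxValue) % (1 << bits);
+        counts[slot] += 1; // count occurrences per hash bucket
+    }
+    return counts;
+}
+```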
+ +### Key Types + +The IDataView system includes the concept of key types. Key types are used for +data that is represented numerically, but where the order and/or magnitude of +the values is not semantically meaningful. For example, hash values, social +security numbers, and the index of a term in a dictionary are all best modeled +with a key type. + +## Components + +The IDataView system includes several standard kinds of components and the +ability to compose them to produce efficient data pipelines. A loader +represents a data source as an `IDataView`. A transform is applied to an +`IDataView` to produce a derived `IDataView`. A saver serializes the data +produced by an `IDataView` to a stream, in some cases in a format that can be +read by a loader. There are other more specific kinds of components defined +and used by the ML.NET code base, for example, scorers, evaluators, joins, and +caches. While there are several standard kinds of components, the set of +component kinds is open. + +### Transforms + +Transforms are a foundational kind of IDataView component. Transforms take an +IDataView as input and produce an IDataView as output. Many transforms simply +"add" one or more computed columns to their input schema. More precisely, +their output schema includes all the columns of the input schema, plus some +additional columns, whose values are computed from some of the input column +values. It is common for an added column to have the same name as an input +column, in which case, the added column hides the input column. Both the +original column and new column are present in the output schema and available +for downstream components (in particular, savers and diagnostic tools) to +inspect. For example, a normalization transform may, for each slot of a +vector-valued column named Features, apply an offset and scale factor and +bundle the results in a new vector-valued column, also named Features. From +the user's perspective (which is entirely based on column names), the Features +column was "modified" by the transform, but the original values are available +downstream via the hidden column. + +Some transforms require training, meaning that their precise behavior is +determined automatically from some training data. For example, normalizers and +dictionary-based mappers, such as the TermTransform, build their state from +training data. Training occurs when the transform is instantiated from +user-provided parameters. Typically, the transform behavior is later serialized. +When deserialized, the transform is not retrained; its behavior is entirely +determined by the serialized information. + +### Composition Examples + +Multiple primitive transforms may be applied to achieve higher-level +semantics. For example, ML.NET's `CategoricalTransform` is the composition of +two more primitive transforms, `TermTransform`, which maps each term to a key +value via a dictionary, and `KeyToVectorTransform`, which maps from key value +to indicator vector. Similarly, `CategoricalHashTransform` is the composition +of `HashTransform`, which maps each term to a key value via hashing, and +`KeyToVectorTransform`. + +Similarly, `WordBagTransform` and `WordHashBagTransform` are each the +composition of three transforms. `WordBagTransform` consists of +`WordTokenizeTransform`, `TermTransform`, and `NgramTransform`, while +`WordHashBagTransform` consists of `WordTokenizeTransform`, `HashTransform`, +and `NgramHashTransform`.
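+To make the composition concrete, here is a minimal sketch, using plain C#
+dictionaries rather than the actual ML.NET transform implementations, of the
+two primitive steps that `CategoricalTransform` composes:
+
+```csharp
+// Illustrative only: dictionary-based versions of the two primitive steps.
+// Requires System.Collections.Generic.
+sealed class TermMap
+{
+    private readonly Dictionary<string, int> _termToKey = new Dictionary<string, int>();
+
+    // TermTransform-like step: map each term to a key value via a dictionary.
+    public int GetKey(string term)
+    {
+        if (!_termToKey.TryGetValue(term, out int key))
+            _termToKey[term] = key = _termToKey.Count + 1; // keys start at 1; 0 means missing
+        return key;
+    }
+
+    // KeyToVectorTransform-like step: map a key value to an indicator vector.
+    public static float[] KeyToIndicator(int key, int cardinality)
+    {
+        var vector = new float[cardinality];
+        if (key >= 1 && key <= cardinality)
+            vector[key - 1] = 1; // one-hot: exactly one slot set
+        return vector;
+    }
+}
+```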
+ +## Cursoring + +### Row Cursor + +To access the data in a view, one gets a row cursor from the view by calling +the `GetRowCursor` method. The row cursor is a movable window onto a single +row of the view, known as the current row. The row cursor provides the column +values of the current row. The `MoveNext()` method of the cursor advances to +the next row. There is also a `MoveMany(long count)` method, which is +semantically equivalent to calling `MoveNext()` repeatedly, `count` times. + +Note that a row cursor is not thread safe; it should be used in a single +execution thread. However, multiple cursors can be active simultaneously on +the same or different threads. + +### Lazy Computation and Active Columns + +It is common in a data pipeline for a down-stream component to only require a +small subset of the information produced by the pipeline. For example, code +that needs to build a dictionary of all terms used in a particular text column +does not need to iterate over any other columns. Similarly, code to display +the first 100 rows does not need to iterate through all rows. When up-stream +computations are lazy, meaning that they are only performed when needed, these +scenarios execute significantly faster than when the up-stream computation is +eager (always performing all computations). + +The IDataView system enables and encourages components to be lazy in both +column and row directions. + +A row cursor has a set of active columns, determined by arguments passed to +`GetRowCursor`. Generally, the cursor, and any upstream components, will only +perform computation or data movement necessary to provide values of the active +columns. For example, when `TermTransform` builds its term dictionary from its +input `IDataView`, it gets a row cursor from the input view with only the term +column active. Any data loading or computation not required to materialize the +term column is avoided. This is lazy computation in the column direction. + +Generally, creating a row cursor is a very cheap operation. The expense is in +the data movement and computation required to iterate over the rows. If a +cursor is used to iterate over a small subset of the input rows, then +generally, only computation and data movement needed to materialize the +requested rows is performed. This is lazy computation in the row direction. + +### Immutability and Repeatability + +Cursoring through data does not modify input data in any way. The root data is +immutable, and the operations performed to materialize derived data are +repeatable. In particular, the values produced by two cursors constructed from +the same view with the same arguments to `GetRowCursor` will be identical. + +Immutability and repeatability enable transparent caching. For example, when a +learning algorithm or other component requires multiple passes over an +IDataView pipeline that includes non-trivial computation, performance may be +enhanced by either caching to memory or caching to disk. Immutability and +repeatability ensure that inserting caching is transparent to the learning +algorithm. + +Immutability also ensures that execution of a composed data pipeline graph is +safe for parallelism. Without the guarantee of immutability, nodes in a data +flow graph can produce side effects that are visible to other non-dependent +nodes. A system where multiple transforms worked by mutating data would be +impossible to predict or reason about, short of the gross inefficiency of +cloning of the source data to ensure consistency. 
+ +The IDataView system's immutability guarantees enable flexible scheduling +without the need to clone data. + +### Batch Parallel Cursoring + +The `GetRowCursor` method on `IDataView` includes options to allow or +encourage parallel execution. If the view is a transform that can benefit from +parallelism, it requests from its input view, not just a cursor, but a cursor +set. If that view is a transform, it typically requests from its input view a +cursor set, etc., on up the transformation chain. At some point in the chain +(perhaps at a loader), a component, called the splitter, determines how many +cursors should be active, creates those cursors, and returns them together +with a consolidator object. At the other end, the consolidator is invoked to +marshal the multiple cursors back into a single cursor. Intervening levels +simply create a cursor on each input cursor, return that set of cursors as +well as the consolidator. + +The ML.NET code base includes transform base classes that implement the +minimal amount of code required to support this batch parallel cursoring +design. Consequently, most transform implementations do not have any special +code to support batch parallel cursoring. + +### Memory Efficiency + +Cursoring is inherently efficient from a memory allocation perspective. +Executing `MoveNext()` requires no memory allocation. Retrieving primitive +column values from a cursor also requires no memory allocation. To retrieve +vector column values from a cursor, the caller can optionally provide buffers +into which the values should be copied. When the provided buffers are +sufficiently large, no additional memory allocation is required. When the +buffers are not provided or are too small, the cursor allocates buffers of +sufficient size to hold the values. This cooperative buffer sharing protocol +eliminates the need to allocate separate buffers for each row. To avoid any +allocation while iterating, client code only need allocate sufficiently large +buffers up front, outside the iteration loop. + +Note that IDataView allows algorithms that need to materialize data in memory +to do so. Nothing in the system prevents a component from cursoring through +the source data and building a complete in-memory representation of the +information needed, subject, of course, to available memory. + +### Data Size + +For large data scenarios, it is critical that the pipeline support efficient +multiple pass "streaming" from disk. IDataView naturally supports streaming +via cursoring through views. Typically, the root of a view is a loader that +pulls information from a file or other data source. We have implemented both +binary .idv and text-based loaders and savers. New loaders and savers can be +added at any time. + +Note that when the data is small, and repeated passes over the data are +needed, the operating system disk cache transparently enhances performance. +Further, when the data is known to fit in memory, caching, as described above, +provides even better performance. + +### Randomization + +Some training algorithms benefit from randomizing the order of rows produced +by a cursor. An `IDataView` indicates via a property whether it supports +shuffling. If it does, a random number generator passed to its `GetRowCursor` +method indicates shuffling should happen, with seed information pulled from +the random number generator. Serving rows from disk in a random order is quite +difficult to do efficiently (without seeking for each row). 
The binary .idv +loader has some shuffling support, favoring performance over attempting to +provide a uniform distribution over the permutation space. This level of +support has been validated to be sufficient for machine learning goals (e.g., +in recent work on the SA-SDCA algorithm). When the data is all in memory, as it is +when cached, randomizing is trivial. + +## Appendix: Comparison with LINQ + +This section is intended for developers familiar with the .Net +`IEnumerable<T>` interface and the LINQ technologies. + +The `IDataView` interface is, in some sense, similar to `IEnumerable<T>`, and +the IDataView system is similar to the LINQ eco-system. The comparisons below +refer to the `IDataView` and `IEnumerable<T>` interfaces as the core +interfaces of their respective worlds. + +In both worlds, there is a cursoring interface associated with the core +interface. In the IEnumerable world, the cursoring interface is +`IEnumerator<T>`. In the IDataView world, the cursoring interface is +`IRowCursor`. + +Both cursoring interfaces have `MoveNext()` methods for forward-only iteration +through the elements. + +Both cursoring interfaces provide access to information about the current +item. For the IEnumerable world, the access is through the `Current` property +of the enumerator. Note that when `T` is a class type, this suggests that each +item served requires memory allocation. In the IDataView world, there is no +single object that represents the current row. Instead, the values of the +current row are directly accessible via methods on the cursor. This avoids +memory allocation for each row. + +In both worlds, the item type information is carried by both the core +interface and the cursoring interface. In the IEnumerable world, this type +information is part of the .Net type, while in the IDataView world, the type +information is much richer and contained in the schema, rather than in the +.Net type. + +In both worlds, many different classes implement the core interface. In the +IEnumerable world, developers explicitly write some of these classes, but many +more implementing classes are automatically generated by the C# compiler, and +returned from methods written using the C# iterator functionality (`yield +return`). In the IDataView world, developers explicitly write all of the +implementing classes, including all loaders and transforms. Unfortunately, +there is no equivalent `yield return` magic. + +In both worlds, multiple cursors can be created and used. + +In both worlds, computation is naturally lazy in the row direction. In the +IEnumerable world, laziness in the column direction would correspond to the +returned `Current` value of type `T` lazily computing some of its properties. + +In both worlds, streaming from disk is naturally supported. + +Neither world supports indexed item access, nor a guarantee that the number of +items is available without iterating and counting. diff --git a/docs/code/IDataViewImplementation.md b/docs/code/IDataViewImplementation.md new file mode 100644 index 0000000000..63fe48b64d --- /dev/null +++ b/docs/code/IDataViewImplementation.md @@ -0,0 +1,518 @@ +# `IDataView` Implementation + +This document is intended as an essay on the best practices for `IDataView` +implementations. As a prerequisite, we suppose that someone has read, and +mostly understood, the following documents: + +* [Design principles](IDataViewDesignPrinciples.md) and +* [Type system](IDataViewTypeSystem.md).
+ +and has also read and understood the code documentation for the `IDataView` +and its attendant interfaces. Given that background, we will expand on best +practices and common patterns that go into a successful implementation of +`IDataView`, and motivate them with real examples and historical learnings. + +Put another way: There are now within the ML.NET codebase many implementations +of `IDataView` and many others in other related code bases that interface with +ML.NET. The corresponding PRs and discussions have resulted in the +accumulation of some information, stuff that is not and perhaps should not be +covered in the specification or XML code documentation, but that is +nonetheless quite valuable to know. That is, not the `IDataView` spec itself, +but many of the logical implications of that spec. + +We will here start with the idioms and practices for `IDataView` generally, +before launching into specific *types* of data views: right now there are two +types of data views that have risen to the dignity of being "general": loaders +and transforms. (There are many "specific" non-general data views: "array" +data views, cache data views, join data views, data views for taking other +abstractions for representing data and phrasing it in a way our code can +understand, but these do not follow any more general pattern as loaders and +transforms do.) + +# Urgency in Adhering to Invariants + +The point of `IDataView` is that it enables composable data pipelines. But +what does that composability, practically, entail? + +There are many implementations of `IDataView` and `IDataTransform` in the +ML.NET codebase. There are, further, many instances of `ITrainer` that consume +those data views. There are more implementations of these currently outside of +this codebase, totaling some hundreds. Astonishingly, they all actually work +well together. The reason why so many transforms can work well with so many +different dataviews as potential inputs, chained in arbitrary and strange ways +we can hardly imagine, and feed well into so many instances of `ITrainer` is +not of course because we wrote code to accommodate the Cartesian product of +all possible inputs, but merely because we assume that any given +implementation of `IDataView` obeys the invariants and principles it must. + +This is a general principle of software engineering, or indeed any +engineering: it is nearly impossible to build any complex system of multiple +parts unless those subcomponents adhere to whatever specifications they're +supposed to, and fulfill their requirements. + +We can to some extent tolerate divergence from the invariants in *some* +components, if they are isolated: we have some losses that behave strangely, +even trainers behave somewhat strangely, sort of. Yet `IDataView` is the +center of our data pipeline, and divergences are potentially more harmful. +There is, for every requirement listed here, actually *something* that is +relying on it. + +The inverse is also true: not only must `IDataView` conform to invariants, +code that consumes `IDataView` should be robust to situations other than the +"happy path." It needn't succeed, but it should at least be able to detect if +data is not in the expected form and fail with an error message telling the +user how they misused it. + +To give the most common example of what I have seen in PRs: often one designs +a transform or learner whose anticipated usage is that it will be used in +conjunction with another transform "upstream" to prepare the data.
(Again, +this is very common: a `KeyToVector` transform for example assumes there's +*something* upstream producing key values.) What happens sometimes is people +forget to check that the input data actually *does* conform to that, with the +result that if a pipeline were composed in some other fashion, there would be +some error. + +The only thing you can really assume is that an `IDataView` behaves "sanely" +according to the contracts of the `IDataView` interface, so that future ML.NET +developers can form some reasonable expectations of how your code behaves, and +also have a prayer of knowing how to maintain the code. It is hard enough to +write software correctly even when the code you're working with actually does +what it is supposed to, and impossible when it doesn't. Anyway, not to belabor +the point: hidden, undocumented, implicit requirements on the usage of a +component are something to avoid at all costs. + +# Design Decisions + +Presumably you are motivated to read this document because you have some +problem of how to get some data into ML.NET, or process data using ML.NET, or +something along these lines. There is a decision to be made about how to even +engineer a solution. Sometimes it's quite obvious: text featurization +obviously belongs as a transform. But other cases are *less* obvious. We will +talk here about how we think about these things. + +One crucial question is whether something should be a data view at all: Often +there is ambiguity. To give some examples of previously contentious points: +should clustering be a *transform* or a *trainer*? What about PCA? What about +LDA? In the end, we decided clustering was a *trainer* and both PCA and LDA +are *transforms*, but this decision was hardly unambiguous. Indeed, what +purpose is served by considering trainers and transforms fundamentally +different things, at all? + +Even once we decide whether something *should* be an `IDataView` of some sort, +the question remains what type of data view. We have some canonical types of +data views: + +If it involves taking data from a stream, like a file, or some sort of stream +of data from a network, or other such thing, we might consider this a +*loader*, that is, it should perhaps implement `IDataLoader`. + +If it involves taking a *single* data view, and transmuting it in some +fashion, **and** the intent is this same transmutation might be applied to +novel data, then it should perhaps implement `IDataTransform`, and be a +transform. + +Now then, consider that not everything should be a loader, or a transform, +even when data could be considered to be read from a stream, or when there is +a data view based on another single data view. The essential purpose of loaders +and transforms is that they can exist as part of the data model, that is, they +should be serializable and applicable to new data. A nice rule of thumb is: if, +when designing something, you can imagine a scenario where you want to apply some +logic to *both* a training set as well as a test set, then it might make sense +to make it a loader or a transform. If not, it probably does not make sense. + +1. Often data comes from some programmatic source, as a starting point for an + ML.NET pipeline. Despite being at the head of the data pipe, it is *not* a + loader, because the data source is not a stream (though it is stream*ing*): + it is a `RowSetDataView`. + +2. During training, data is sometimes cached. The structure that handles the + data caching is a `CacheDataView`. It is absolutely not a transform, + despite taking a single input and being itself an `IDataView`.
There is no + reason to make it a transform, because there is no plausible rationale to + make it part of the data model: the decision of whether you want to cache + data during *training* has nothing at all to do with whether you want to + cache data during *scoring*, so there is no point in saving it to the data + model. + +3. The ML.NET API for prediction uses a scheme that phrases input data + programmatically as coming from an enumerable of typed objects: the + underlying programmatic `IDataView` that is constructed to wrap this is + *not* a loader, because it is not part of the data model. It is merely the + entry point to the data model, at least, in typical usage. + +# Why `GetGetter`? + +Let us address something fairly conspicuous. The question almost everyone +asks, when they first start using `IDataView`: what is up with these getters? + +One does not fetch values directly from an `IRow` implementation (including +`IRowCursor`). Rather, one retains a delegate that can be used to fetch +objects, through the `GetGetter` method on `IRow`. This delegate is: + +```csharp +public delegate void ValueGetter<TValue>(ref TValue value); +``` + +If you are unfamiliar with delegates, [read +this](https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/delegates/). +Anyway: you open a row cursor, you get the delegate through this `GetGetter` +method, and you use this delegate multiple times to fetch the actual column +values as you `MoveNext` through the cursor. + +Some history to motivate this: In the first version of `IDataView`, the +`IRowCursor` implementation did not actually have these "getters" but rather +had a method, `GetColumnValue<TValue>(int col, ref TValue val)`. However, this +has the following problems: + +* **Every** call had to verify that the column was active, +* **Every** call had to verify that `TValue` was of the right type, +* When these were part of, say, a transform in a chain (as they often are, + considering how commonly transforms are used by ML.NET's users) each access + would be accompanied by a virtual method call to the upstream cursor's + `GetColumnValue`. + +In contrast, consider the situation with these getter delegates. The +verification of whether the column is active happens *exactly* once. The +verification of types happens *exactly* once. Rather than *every* access being +passed up through a chain of dozens of transform cursors, you merely get a +getter from whatever cursor is serving it up, and do every access directly +without having to pass through umpteen virtual method calls (each, naturally, +accompanied by their own checks!). With these preliminaries done, a getter on +every iteration, when called, merely has to fill in the value: all this +verification work is already taken care of. The practical result of this is +that, for some workloads where the getters merely amounted to assigning +values, the "getter" method became an order of magnitude faster. So: we got +rid of this `GetColumnValue` method, and now work with `GetGetter`. + +# Repeatability + +A single `IDataView` instance should be considered a consistent view onto +data. So: if you open a cursor on the same `IDataView` instance, and access +values for the same columns, it will apparently be a "consistent" view. It is +probably obvious what this means, but specifically: + +The cursor as returned through `GetRowCursor` (with perhaps an identically +constructed `IRandom` instance) in any iteration should return the same number +of rows on all calls, and with the same values at each row.
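+For instance, here is a sketch of what this contract implies in practice,
+using the `GetRowCursor`/`GetGetter` pattern described above (assuming the
+`IRowCursor` and `ValueGetter<T>` shapes from this era of the codebase):
+
+```csharp
+// Reads all values of one column. By the repeatability contract, calling
+// this twice on the same IDataView instance must yield identical sequences.
+// Requires System.Collections.Generic.
+static List<float> ReadColumn(IDataView view, int col)
+{
+    var values = new List<float>();
+    // Open a cursor with only the requested column active.
+    using (IRowCursor cursor = view.GetRowCursor(c => c == col))
+    {
+        // Fetch the getter exactly once; reuse it for every row.
+        ValueGetter<float> getter = cursor.GetGetter<float>(col);
+        float value = default(float);
+        while (cursor.MoveNext())
+        {
+            getter(ref value); // cooperative pattern: fills the value in place
+            values.Add(value);
+        }
+    }
+    return values;
+}
+```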
+ +Why is this important? Many machine learning algorithms require multiple +passes over the dataset. Most stochastic methods wouldn't really care if the +data changed, but others are *very* sensitive to changes in the data. For +example, how could an L-BFGS or OWL-QN algorithm effectively compute its +approximation to a Hessian, if the examples from which the per-pass history +is computed were not consistent? How could a dual algorithm like SDCA +function with any accuracy, if the examples associated with any given dual +variable were to change? Consider even a relatively simple transform, like a +forward-looking windowed averager, or anything relating to time series. The +implementations of those `ICursor` interfaces often open *two* cursors on the +underlying `IDataView`, one "look ahead" cursor used to gather and calculate +necessary statistics, and another cursor to serve the actual data: how could the column +constructed out of that transform be meaningful if the look-ahead cursor was +consuming different data from the contemporaneous cursor? There are many +examples of this throughout the codebase. + +Nevertheless: in very specific circumstances we have relaxed this. For +example, some ML.NET API code serves up corrupt `IDataView` implementations +that have their underlying data change, since reconstituting a data pipeline +on fresh data is at the present moment too resource intensive. Nonetheless, +this is wrong: for example, the `TrainingCursorBase` and related subclasses +rely upon the data not changing. Since, however, that is used for *training*, +and the prediction engines of the API are used for *scoring*, we accept this. +However, this is not, strictly speaking, correct, and this sort of corruption +of `IDataView` should only be considered as a last resort, and only when some +great good can be accomplished through this. We certainly did not accept this +corruption lightly! + +# Norms for the Data Model + +In a similar vein to repeatability and consistency is the notion of the data +model. Unlike repeatability, this topic is a bit specialized: `IDataView` +specifically is not serializable, but both `IDataLoader` and `IDataTransform` +are serializable. Nonetheless those are the two most important types of data +views, so we will treat them here. + +From a user's perspective, when they run ML.NET and specify a loader or set of +transforms, what they are doing is composing a data pipe. For example, perhaps +they specify a way to load data from, say, a text file, apply some +normalization, some categorical handling, some text, some this, some that, +some everything, and it all just works, and is consistent whether we're +applying that to the training data on which the transforms were defined, or +some other test set, whether we programmatically load the model in the API and +apply it to some production setting, whether we are running in a distributed +environment and want to make sure *all* worker nodes are featurizing data in +exactly the same way, etc. etc. + +The way in which this consistency is accomplished is by having certain +requirements on the essential parts of the data model: loaders and transforms. +The essential reason these things exist is so that they can be applied to new +data in a consistent way. + +Let us formalize this somewhat.
We consider two data views to be functionally +identical if there is absolutely no way to distinguish them: they return the +same values, have the same types, same number of rows, they shuffle +identically given identically constructed `IRandom` instances when row cursors are +constructed, return the same ID for rows from the ID getter, etc. Obviously +this concept is transitive. (Of course, `Batch` in a cursor might be different +between the two, but that is the case even with two cursors constructed on the +same data view.) So some rules: + +1. If you have an `IDataLoader`, then saving/loading the associated data model + on the same data should result in a functionally identical `IDataLoader`. + +2. If you have an `IDataTransform`, then saving/loading the associated data + model for the transforms on functionally identical `IDataView`s, should + itself result in functionally identical `IDataView`s. + +## Versioning + +This requirement for consistency of a data model often has implications across +versions of ML.NET, and our requirements for data model backwards +compatibility. As time has passed, we often feel like it would make sense if a +transform behaved *differently*, that is, if it organized or calculated its +output in a different way than it currently does. For example, suppose we +wanted to switch the hash transform to something a bit more efficient than +murmur hashes. If we did so, presumably the same input values +would map to different outputs. We are free to do so, of course, yet: when we +deserialize a hash transform from before we made this change, that hash +transform should continue to output values as it did, before we made that +change. (This, of course, assumes that the transform was released as part of +a "blessed" non-preview point release of ML.NET. We can, and have, broken +backwards compatibility for something that has not yet been incorporated in +any sort of blessed release, though we prefer not to.) + +## What is Not Functionally Identical + +Note that identically *constructed* data views are not necessarily +*functionally* identical. Consider this usage of the train and score transform +with `xf=trainScore{tr=ap}`, where we first train an averaged perceptron, then +copy its score and probability columns out of the way, then construct the +same basic transform again. + +```maml +maml.exe showdata saver=md seed=1 data=breast-cancer.txt xf=trainScore{tr=ap} + xf=copy{col=ScoreA:Score col=ProbA:Probability} xf=trainScore{tr=ap} +``` + +The result is this.
+ +Label | Features | PredictedLabel | Score | Probability | ScoreA | ProbA +------|------------------------------|----------------|--------|--------------|--------|------- +0 | 5, 1, 1, 1, 2, 1, 3, 1, 1 | 0 | -62.07 | 0.0117 | -75.28 | 0.0107 +0 | 5, 4, 4, 5, 7, 10, 3, 2, 1 | 1 | 88.41 | 0.8173 | 92.04 | 0.8349 +0 | 3, 1, 1, 1, 2, 2, 3, 1, 1 | 0 | -40.53 | 0.0269 | -44.23 | 0.0329 +0 | 6, 8, 8, 1, 3, 4, 3, 7, 1 | 1 | 201.21 | 0.9973 | 208.07 | 0.9972 +0 | 4, 1, 1, 3, 2, 1, 3, 1, 1 | 0 | -43.11 | 0.0243 | -55.32 | 0.0221 +1 | 8, 10, 10, 8, 7, 10, 9, 7, 1 | 1 | 259.22 | 0.9997 | 257.43 | 0.9995 +0 | 1, 1, 1, 1, 2, 10, 3, 1, 1 | 1 | 71.10 | 0.6933 | 89.52 | 0.8218 +0 | 2, 1, 2, 1, 2, 1, 3, 1, 1 | 0 | -38.94 | 0.0286 | -39.59 | 0.0388 +0 | 2, 1, 1, 1, 2, 1, 1, 1, 5 | 0 | -32.87 | 0.0360 | -41.52 | 0.0362 +0 | 4, 2, 1, 1, 2, 1, 2, 1, 1 | 0 | -31.76 | 0.0376 | -41.68 | 0.0360 + +One could argue it's not *really* identically constructed, exactly, since both +of those transforms (including the underlying averaged perceptron learner!) +are initialized using the pseudo-random number generator in an `IHost` that +changes from one to another. But, that's a bit nit-picky. + +Note also: when we say functionally identical we include everything about it: +not just the data, but the schema, its metadata, the implementation of +shuffling, etc. For this reason, while serializing the data *model* has +guarantees of consistency, serializing the *data* has no such guarantee: if +you serialize data using the text saver, practically all metadata (except slot +names) will be completely lost, which can have implications for how some +transforms and downstream processes work. Or: if you serialize data using the +binary saver, suddenly it may become shufflable whereas it may not have been +before. + +The inevitable caveat to all this stuff about "consistency" is that it is +ultimately limited by hardware and other runtime environment factors: the +truth is, certain machines will, given identical programs with seemingly +identical flows of execution, *sometimes* result in subtly different answers +where floating point values are concerned. Even on the same machine there are +runtime considerations, e.g., when .NET's RyuJIT was introduced in VS2015, we +had lots of test failures around our model consistency tests because the JIT +was compiling the IL just *slightly* differently. But, this sort of thing +aside (which we can hardly help), we expect the models to be the same. + +# On Loaders, Data Models, and Empty `IMultiStreamSource`s + +When you create a loader you have the option of specifying not only *one* data +input, but any number of data input files, including zero. But there's also a +more general principle at work here with zero files: when deserializing a data +loader from a data model with an `IMultiStreamSource` with `Count == 0` (e.g., +as would be constructed with `new MultiFileSource(null)`), we have a protocol +that *every* `IDataLoader` should work in that circumstance, serving merely a +data view with no rows, but with the same schema as it had when it was serialized. +The purpose of this is that we often have circumstances where we need to +understand the schema of the data (what columns were produced, what the +feature names are, etc.) when all we have is the data model. (E.g., the +`savemodel` command, and other things.)
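+A sketch of the idea follows. `MultiFileSource` is the real class mentioned
+above, but the model-loading helper here is a hypothetical stand-in, since
+the exact deserialization entry point is outside the scope of this document:
+
+```csharp
+// Deserialize a data model with zero input files to recover the schema only.
+IMultiStreamSource noFiles = new MultiFileSource(null); // Count == 0
+IDataLoader loader = LoadLoaderFromModel(modelStream, noFiles); // hypothetical helper
+
+// Per the protocol above: no rows are served, but the schema is intact.
+ISchema schema = loader.Schema;
+for (int col = 0; col < schema.ColumnCount; col++)
+    Console.WriteLine($"{schema.GetColumnName(col)}: {schema.GetColumnType(col)}");
+```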
+ +# Getters Must Fail for Invalid Types + +For a given `IRow`, we must expect that `GetGetter<TValue>(col)` will throw if +either `IsColumnActive(col)` is `false`, or `typeof(TValue) != +Schema.GetColumnType(col).RawType`, as indicated in the code documentation. +But why? It might seem reasonable to add seemingly "harmless" flexibility to +this interface. So let's imagine your type should be `float`, because the +corresponding column's type's `RawType` is `typeof(float)`. Now: if you +*happen* to call `GetGetter<double>(col)` instead of `GetGetter<float>(col)`, +it would actually be a fairly easy matter for `GetGetter<double>` to actually +accommodate it, by doing the necessary transformations under the hood, and +*not* fail. This type of thinking is actually insidiously and massively +harmful to the codebase, as I will remark. + +The danger of writing code is that there's a chance someone might find it +useful. Imagine a consumer of your dataview actually relies on your +"tolerance." What that means, of course, is that this consuming code cannot +function effectively on any *other* dataview. The consuming code is by +definition *buggy*: it is requesting data of a type we've explicitly claimed, +through the schema, that we do not support. And the developer, through a +well-intentioned but misguided design decision, has allowed buggy code to pass a +test it should have failed, thus making the codebase more fragile, when, had we +simply maintained our requirements, we would have detected the bug. + +Moreover: it is a solution to a problem that does not exist. `IDataView`s are +fundamentally composable structures already, and one of the most fundamental +operations you can do is transform columns into different types. So, there is +no need for you to do the conversion yourself. Indeed, it is harmful for you +to try: if we have the conversion capability in one place, including the logic +of what can be converted and *how* these things are to be converted, is it +reasonable to suppose we should have it in *every* implementation of +`IDataView`? Certainly not. At best the situation will be needless complexity +in the code: more realistically it will lead to inconsistency, and from +inconsistency, surprises and bugs for users and developers. + +# Thread Safety + +Any `IDataView` implementation, as well as the `ISchema`, *must* be thread +safe. There is a lot of code that depends on this. For example, cross +validation works by operating over the same dataset (just, of course, filtered +to different subsets of the data). That amounts to multiple cursors being +opened, simultaneously, over the same data. + +So: `IDataView` and `ISchema` must be thread safe. However, `IRowCursor`, +being a stateful object, we assume is accessed from exactly one thread at a +time. The `IRowCursor`s returned through `GetRowCursorSet` are a different +matter: while each single one must be accessed by a single thread at a time, multiple +threads can access the set of cursors simultaneously: that's why we have that +method in the first place. + +# Exceptions and Errors + +There is one non-obvious implication of the lazy evaluation while cursoring +over an `IDataView`: while cursoring, you should almost certainly not throw +exceptions. + +Imagine you have a `TextLoader`. You might expect that if you have a parse +error, e.g., you have a column of floats, and one of the rows has a value +like, `"hi!"` or something otherwise uninterpretable, you would throw. Yet, +consider the implications of lazy evaluation.
+
+# Thread Safety
+
+Any `IDataView` implementation, as well as the `ISchema`, *must* be thread
+safe. There is a lot of code that depends on this. For example, cross
+validation works by operating over the same dataset (just, of course,
+filtered to different subsets of the data). That amounts to multiple cursors
+being opened, simultaneously, over the same data.
+
+So: `IDataView` and `ISchema` must be thread safe. However, `IRowCursor`,
+being a stateful object, we assume is accessed from exactly one thread at a
+time. The `IRowCursor`s returned through `GetRowCursorSet` are similar:
+while each single cursor must be accessed by only one thread at a time,
+multiple threads can access the set of cursors simultaneously: that's why we
+have that method in the first place.
+
+# Exceptions and Errors
+
+There is one non-obvious implication of the lazy evaluation while cursoring
+over an `IDataView`: while cursoring, you should almost certainly not throw
+exceptions.
+
+Imagine you have a `TextLoader`. You might expect that if you have a parse
+error, e.g., you have a column of floats, and one of the rows has a value
+like `"hi!"` or something otherwise uninterpretable, you would throw. Yet,
+consider the implications of lazy evaluation. If that column were not
+selected, the cursoring would *succeed*, because it would not look at that
+`"hi!"` token *at all*, much less detect that it was not parsable as a
+float.
+
+If we were to throw, the effect is that *sometimes* the cursoring will
+succeed (if the column is not selected), and *sometimes* it will fail (if it
+is selected). These failures are explainable, ultimately, of course, in the
+sense that anything is explainable, but a user knows nothing about lazy
+evaluation or anything like this: correspondingly this is enormously
+confusing.
+
+The implication is that we should not throw an exception in this case. We
+instead consider this value "missing," and we *may* register a warning using
+`IChannel.Warning`, but we cannot fail.
+
+So: if you could reasonably throw the exception on *any* cursoring over your
+`IDataView`, you can throw. If, however, detecting the condition on which
+you would throw the exception requires that a certain column be made active,
+then you should not throw. Of course, there are extreme circumstances: for
+example, one cannot help but throw during a cursoring if, say, there is some
+weird system event, and if one somehow detects in a subsequent iteration
+that something is fundamentally broken then one can throw: e.g., the binary
+loader will throw if it detects the file it is reading is corrupted, even if
+that corruption may not have been obvious immediately.
+
+# `GetGetter` Returning the Same Delegate
+
+On a single instance of `IRowCursor`, since each `IRowCursor` instance has
+no requirement to be thread safe, it is entirely legal for a call to
+`GetGetter` on a single column to just return the same getter delegate each
+time. It has come to pass that the majority of implementations of
+`IRowCursor` actually do that, since it is in some ways easier to write the
+code that way.
+
+This practice has inadvertently enabled a fairly attractive tool for
+analysis of data pipelines: by returning the same delegate each time, we can
+check in a data pipeline what data is being passed through by seeing whether
+the references to getter delegates are being passed through. Now this is
+imperfect, because some transforms that could use the same delegate each
+time do not, but the vast majority do.
+
+# Class Structuring
+
+The essential attendant classes of an `IDataView` are its schema, as
+returned through the `Schema` property, as well as the `IRowCursor`
+implementation(s), as returned through the `GetRowCursor` and
+`GetRowCursorSet` methods. The implementations for those two interfaces are
+typically nested within the `IDataView` implementation itself. The cursor
+implementation is almost always at the bottom of the data view class.
+
+# `IRow` and `ICursor` vs. `IRowCursor`
+
+We have `IRowCursor` which descends from both `IRow` and `ICursor`. Why do
+these other interfaces exist?
+
+Firstly, there are implementations of `IRow` or `ICursor` that are not
+`IRowCursor`s. We have occasionally found it useful to have something
+resembling a key-value store, but that is strongly, dynamically typed in
+some fashion. Why not simply represent this using the same idioms of
+`IDataView`? So we put them in an `IRow`. Similarly: we have several things
+that behave *like* cursors, but that are in no way *row* cursors.
+
+However, more than that, there are a number of utility functions where we
+want to operate over something like an `IRowCursor`, but we want to have
+some indication that this function will not move the cursor (in which case
+`IRow` is helpful), or that will not access any values (in which case
+`ICursor` is helpful).
+
+# Schema
+
+The schema contains information about the columns. As we see in [the design
+principles](IDataViewDesignPrinciples.md), each column has an index, a data
+type, and optional metadata.
+
+While *programmatic* accesses to an `IDataView` are by index, from a user's
+perspective columns are referenced by name; most training algorithms
+conceptually train on the `Features` column (under default settings). For
+this reason nearly all usages of an `IDataView` will be prefixed with a call
+to the schema's `TryGetColumnIndex`.
+
+Regarding name hiding, the principles mention that when multiple columns
+have the same name, other columns are "hidden." The convention all
+implementations of `ISchema` obey is that the column with the *largest*
+index is the visible one. Note however that this is merely convention, not
+part of the definition of `ISchema`.
+
+Implementations of `TryGetColumnIndex` should be O(1), that is, practically,
+this mapping ought to be backed with a dictionary in most cases. (There are
+obvious exceptions like, say, `LineLoader`, which produces exactly one
+column. There, a simple equality test suffices.)
+
+It is best if `GetColumnType` returns the *same* object every time. That is,
+things like key-types and vector-types, when returned, should not be created
+in the function itself (thereby creating a new object every time), but
+rather stored somewhere and returned.
+
+## Metadata
+
+Since metadata is *optional*, one is not obligated to necessarily produce
+it, or conform to any particular schemas for any particular kinds (beyond,
+say, the obvious things like making sure that the types and values are
+consistent). However, the flip side of that freedom given to *producers* is
+that *consumers* are obligated, when processing a data view input, to react
+gracefully when metadata of a certain kind is absent, or not in a form that
+one expects. One should *never* fail when input metadata is in a form one
+does not expect.
+
+To give a practical example of this: many transforms, learners, or other
+components that process `IDataView`s will do something with the slot names,
+but when the `SlotNames` metadata kind for a given column is either absent,
+*or* not of the right type (vectors of strings), *or* not of the right size
+(vectors of the same length as the input), the behavior is not to throw or
+yield errors or do anything of the kind, but to simply say, "oh, I don't
+really have slot names," and proceed as if the slot names hadn't been
+present at all.
\ No newline at end of file
diff --git a/docs/code/IDataViewTypeSystem.md b/docs/code/IDataViewTypeSystem.md
new file mode 100644
index 0000000000..c152a667cf
--- /dev/null
+++ b/docs/code/IDataViewTypeSystem.md
@@ -0,0 +1,843 @@
+# `IDataView` Type System
+
+## Overview
+
+The *IDataView system* consists of a set of interfaces and classes that
+provide efficient, compositional transformation of and cursoring through
+schematized data, as required by many machine-learning and data analysis
+applications. It is designed to gracefully and efficiently handle both
+extremely high dimensional data and very large data sets.
+It does not directly address distributed data, but is suitable for single
+node processing of data partitions belonging to larger distributed data
+sets.
+
+While `IDataView` is one interface in this system, colloquially, the term
+IDataView is frequently used to refer to the entire system. In this
+document, the specific interface is written using fixed pitch font as
+`IDataView`.
+
+IDataView is the data pipeline machinery for ML.NET. The ML.NET codebase has
+an extensive library of IDataView related components (loaders, transforms,
+savers, trainers, predictors, etc.). More are being worked on.
+
+The name IDataView was inspired by the database world, where the term table
+typically indicates a mutable body of data, while a view is the result of a
+query on one or more tables or views, and is generally immutable. Note that
+both tables and views are schematized, being organized into typed columns
+and rows conforming to the column types. Views differ from tables in several
+ways:
+
+* Views are immutable; tables are mutable.
+
+* Views are composable -- new views can be formed by applying
+  transformations (queries) to other views. Forming a new table from an
+  existing table involves copying data, making them decoupled—the new table
+  is not linked to the original table in any way.
+
+* Views are virtual; tables are fully realized/persisted.
+
+Note that immutability and compositionality are critical enablers of
+technologies that require reasoning over transformation, like query
+optimization and remoting. Immutability is also key for concurrency and
+thread safety.
+
+This document includes a very brief introduction to some of the basic
+concepts of IDataView, but then focuses primarily on the IDataView type
+system.
+
+Why does IDataView need a special type system? The .NET type system is not
+well suited to machine-learning and data analysis needs. For example, while
+one could argue that `typeof(double[])` indicates a vector of double values,
+it explicitly does not include the dimensionality of the vector/array.
+Similarly, there is no good way to indicate a subset of an integer type, for
+example integers from 1 to 100, as a .NET type. In short, there is no
+reasonable way to encode complete range and dimensionality information in a
+`System.Type`.
+
+In addition, a well-defined type system, including complete specification of
+standard data types and conversions, enables separately authored components
+to seamlessly work together without surprises.
+
+### Basic Concepts
+
+`IDataView`, in the narrow sense, is an interface implemented by many
+components. At a high level, it is analogous to the .Net interface
+`IEnumerable<T>`, with some very significant differences.
+
+While `IEnumerable<T>` is a sequence of objects of type `T`, `IDataView` is
+a sequence of rows. An `IDataView` object has an associated `ISchema` object
+that defines the `IDataView`'s columns, including their names, types,
+indices, and associated metadata. Each row of the `IDataView` has a value
+for each column defined by the schema.
+
+Just as `IEnumerable<T>` has an associated enumerator interface, namely
+`IEnumerator<T>`, `IDataView` has an associated cursor interface, namely
+`IRowCursor`. In the enumerable world, an enumerator object implements a
+`Current` property that returns the current value of the iteration as an
+object of type `T`. In the IDataView world, an `IRowCursor` object
+encapsulates the current row of the iteration. There is no separate object
+that represents the current row. Instead, the cursor implements methods that
+provide the values of the current row, when requested. Additionally, the
+methods that serve up values do not require memory allocation on each
+invocation, but use sharable buffers. This scheme significantly reduces the
+memory allocations needed to cursor through data.
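+
+As a hedged illustration of this style of access (`dataView` is a given
+`IDataView`, and the column name and `float` type are invented for the
+example), cursoring over one column looks roughly like this:
+
+```csharp
+// Open a cursor over just one column; the getter fills a shared buffer
+// rather than allocating a new value for each row.
+int col;
+if (!dataView.Schema.TryGetColumnIndex("Label", out col))
+    throw new InvalidOperationException("No Label column.");
+using (IRowCursor cursor = dataView.GetRowCursor(c => c == col))
+{
+    ValueGetter<float> getter = cursor.GetGetter<float>(col);
+    float value = default(float);
+    while (cursor.MoveNext())
+    {
+        getter(ref value); // re-fills the same buffer for each row
+        Console.WriteLine(value);
+    }
+}
+```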
+
+Both `IDataView` and `IEnumerable<T>` present a read-only view on data, in
+the sense that a sequence presented by each is not directly mutable.
+"Modifications" to the sequence are accomplished by additional operators or
+transforms applied to the sequence, so do not modify any underlying data.
+For example, to normalize a numeric column in an `IDataView` object, a
+normalization transform is applied to the sequence to form a new `IDataView`
+object representing the composition. In the new view, the normalized values
+are contained in a new column. Often, the new column has the same name as
+the original source column and "replaces" the source column in the new view.
+Columns that are not involved in the transformation are simply "passed
+through" from the source `IDataView` to the new one.
+
+Detailed specifications of the `IDataView`, `ISchema`, and `IRowCursor`
+interfaces are in other documents.
+
+### Column Types
+
+Each column in an `IDataView` has an associated column type. The collection
+of column types is open, in the sense that new code can introduce new column
+types without requiring modification of all `IDataView` related components.
+While introducing new types is possible, we expect it will also be
+relatively rare.
+
+All column type implementations derive from the abstract class `ColumnType`.
+Primitive column types are those whose implementation derives from the
+abstract class `PrimitiveType`, which derives from `ColumnType`.
+
+### Representation Type
+
+A column type has an associated .Net type, known as its representation type
+or raw type.
+
+Note that a column type often contains much more information than the
+associated .Net representation type. Moreover, many distinct column types
+can use the same representation type. Consequently, code should not assume
+that a particular .Net type implies a particular column type.
+
+### Standard Column Types
+
+There is a set of predefined standard column types, divided into standard
+primitive types and vector types. Note that there can be types that are
+neither primitive nor vector types. These types are not standard types and
+may require extra care when handling them. For example, a `PictureType`
+value might require disposing when it is no longer needed.
+
+Standard primitive types include the text type, the boolean type, numeric
+types, and key types. Numeric types are further split into floating-point
+types, signed integer types, and unsigned integer types.
+
+A vector type has an associated item type that must be a primitive type, but
+need not be a standard primitive type. Note that vector types are not
+primitive types, so vectors of vectors are not supported. Note also that
+vectors are homogeneous—all elements are of the same type. In addition to
+its item type, a vector type contains dimensionality information. At the
+basic level, this dimensionality information indicates the length of the
+vector type. A length of zero means that the vector type is variable length,
+that is, different values may have different lengths. Additional detail of
+vector types is in a subsequent section. Vector types are instances of the
+sealed class `VectorType`, which derives from `ColumnType`.
+
+This document uses convenient shorthand for standard types:
+
+* `TX`: text
+
+* `BL`: boolean
+
+* `R4`, `R8`: single and double precision floating-point
+
+* `I1`, `I2`, `I4`, `I8`: signed integer types with the indicated number of
+  bytes
+
+* `U1`, `U2`, `U4`, `U8`: unsigned integer types with the indicated number
+  of bytes
+
+* `UG`: unsigned type with 16 bytes, typically used as a unique ID
+
+* `TS`: timespan, a period of time
+
+* `DT`: datetime, a date and time but no timezone
+
+* `DZ`: datetime zone, a date and time with a timezone
+
+* `U4[100-199]`: A key type based on `U4` representing legal values from
+  100 to 199, inclusive
+
+* `V<R4, 3, 2>`: A vector type with item type `R4` and dimensionality
+  information [3,2]
+
+See the sections on the specific types for more detail.
+
+The IDataView system includes many standard conversions between standard
+primitive types. A later section contains a full specification of these
+conversions.
+
+### Default Value
+
+Each column type has an associated default value corresponding to the
+default value of its representation type, as defined by the .Net (C# and
+CLR) specifications.
+
+The standard conversions map source default values to destination default
+values. For example, the standard conversion from `TX` to `R8` maps the
+empty text value to the value zero. Note that the empty text value is
+distinct from the missing text value, as discussed next.
+
+### Missing Value
+
+Most of the standard primitive types support the notion of a missing value.
+In particular, the text type, floating-point types, signed integer types,
+and key types all have an internal representation of missing. We follow R's
+lead and denote such values as `NA`.
+
+Unlike R, the standard primitive types do not distinguish between missing
+and invalid. For example, in floating-point arithmetic, computing zero
+divided by zero, or infinity minus infinity, produces an invalid value known
+as a `NaN` (for Not-a-Number). R uses a specific `NaN` value to represent
+its `NA` value, with all other `NaN` values indicating invalid. The
+IDataView standard floating-point types do not distinguish between the
+various `NaN` values, treating them all as missing/invalid.
+
+A standard conversion from a source type with `NA` to a destination type
+with `NA` maps `NA` to `NA`. A standard conversion from a source type with
+`NA` to a destination type without `NA` maps `NA` to the default value of
+the destination type. For example, converting a text `NA` value to `R4`
+produces a `NaN`, but converting a text `NA` to `U4` results in zero. Note
+that this specification does not address diagnostic user messages, so, in
+certain environments, the latter situation may generate a warning to the
+user.
+
+Note that a vector type does not support a representation of missing, but
+may contain `NA` values of its item type. Generally, there is no standard
+mechanism faster than O(N) for determining whether a vector with N items
+contains any missing values.
+
+For further details on missing value representations, see the sections
+detailing the particular standard primitive types.
+
+### Vector Representations
+
+Values of a vector type may be represented either sparsely or densely. A
+vector type does not mandate denseness or sparsity, nor does it imply that
+one is favored over the other. A sparse representation is semantically
+equivalent to a dense representation having the suppressed entries filled in
+with the *default* value of the item type.
+Note that the values of the
+suppressed entries are emphatically *not* the missing/`NA` value of the item
+type, unless the missing and default values are identical, as they are for
+key types.
+
+### Metadata
+
+A column in an `ISchema` can have additional column-wide information, known
+as metadata. For each string value, known as a metadata kind, a column may
+have a value associated with that metadata kind. The value also has an
+associated type, which is a compatible column type.
+
+For example:
+
+* A column may indicate that it is normalized, by providing a `BL` valued
+  piece of metadata named `IsNormalized`.
+
+* A column whose type is `V<R4, 17>`, meaning a vector of length 17 whose
+  items are single-precision floating-point values, might have `SlotNames`
+  metadata of type `V<TX, 17>`, meaning a vector of length 17 whose items
+  are text.
+
+* A column produced by a scorer may have several pieces of associated
+  metadata, indicating the "scoring column group id" that it belongs to,
+  what kind of scorer produced the column (e.g., binary classification), and
+  the precise semantics of the column (e.g., predicted label, raw score,
+  probability).
+
+The `ISchema` interface, including the metadata API, is fully specified in
+another document.
+
+## Text Type
+
+The text type, denoted by the shorthand `TX`, represents text values. The
+`TextType` class derives from `PrimitiveType` and has a single instance,
+exposed as `TextType.Instance`. The representation type of `TX` is an
+immutable struct known as `DvText`. A `DvText` value represents a sequence
+of characters whose length is contained in its `Length` field. The
+missing/`NA` value has a `Length` of -1, while all other values have a
+non-negative `Length`. The default value has a `Length` of zero and
+represents an empty sequence of characters.
+
+In text processing transformations, it is very common to split text into
+pieces. A key advantage of using `DvText` instead of `System.String` for
+text values is that these splits require no memory allocation—the derived
+`DvText` references the same underlying `System.String` as the original
+`DvText` does. Another reason that `System.String` is not ideal for text is
+that we want the default value to be empty and not `NA`. For
+`System.String`, the default value is null, which would be a more natural
+representation for `NA` than for empty text. By using a custom struct
+wrapper around a portion (or span) of a `System.String`, we address both the
+memory efficiency and default value problems.
+
+## Boolean Type
+
+The standard boolean type, denoted by the shorthand `BL`, represents
+true/false values. The `BooleanType` class derives from `PrimitiveType` and
+has a single instance, exposed as `BooleanType.Instance`. The representation
+type of `BL` is the `DvBool` enumeration type, logically stored as `sbyte`:
+
+`DvBool` | `sbyte` Value
+--------:|:-------------
+`NA`     | -128
+`False`  | 0
+`True`   | 1
+
+The default value of `BL` is `DvBool.False` and the `NA` value of `BL` is
+`DvBool.NA`. Note that the underlying type of the `DvBool` `enum` is signed
+byte and the default and `NA` values of `BL` align with the default and `NA`
+values of `I1`.
+
+There is a standard conversion from `TX` to `BL`. There are standard
+conversions from `BL` to all signed integer and floating point numeric
+types, with `DvBool.False` mapping to zero, `DvBool.True` mapping to one,
+and `DvBool.NA` mapping to `NA`.
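+
+A minimal sketch of those semantics, assuming a `DvBool` declared per the
+table above (this is illustrative, not the actual library code):
+
+```csharp
+enum DvBool : sbyte { NA = -128, False = 0, True = 1 }
+
+static class BoolConversions
+{
+    // BL -> R4: False -> 0, True -> 1, NA -> NaN (the R4 NA value).
+    public static float ToR4(DvBool value)
+    {
+        switch (value)
+        {
+            case DvBool.False: return 0f;
+            case DvBool.True: return 1f;
+            default: return float.NaN;
+        }
+    }
+}
+```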
+
+## Number Types
+
+The standard number types are all instances of the sealed class
+`NumberType`, which is derived from `PrimitiveType`. There are two standard
+floating-point types, four standard signed integer types, and four standard
+unsigned integer types. Each of these is represented by a single instance of
+`NumberType` and there are static properties of `NumberType` to access each
+instance. For example, to test whether a variable type represents `I4`, use
+the C# code `type == NumberType.I4`.
+
+Floating-point arithmetic has a well-deserved reputation for being
+troublesome. This is primarily because it is imprecise, in the sense that
+the result of most operations must be rounded to the nearest representable
+value. This rounding means, among other side effects, that floating-point
+addition and multiplication are not associative, nor do they satisfy the
+distributive property.
+
+However, in many ways, floating-point arithmetic is the best-suited system
+for arithmetic computation. For example, the IEEE 754 specification mandates
+precise graceful overflow behavior—as results grow, they lose resolution in
+the least significant digits, and eventually overflow to a special infinite
+value. In contrast, when integer arithmetic overflows, the result is a
+nonsense value. Trapping and handling integer overflow is expensive, both in
+runtime and development costs.
+
+The IDataView system supports integer numeric types mostly for data
+interchange convenience, but we strongly discourage performing arithmetic on
+those values without first converting to floating-point.
+
+### Floating-point Types
+
+The floating-point types, `R4` and `R8`, have representation types
+`System.Single` and `System.Double`. Their default values are zero. Any
+`NaN` is considered an `NA` value, with the specific `Single.NaN` and
+`Double.NaN` values being the canonical `NA` values.
+
+There are standard conversions from each floating-point type to the other
+floating-point type. There are also standard conversions from text to each
+floating-point type and from each integer type to each floating-point type.
+
+### Signed Integer Types
+
+The signed integer types, `I1`, `I2`, `I4`, and `I8`, have representation
+types `System.SByte`, `System.Int16`, `System.Int32`, and `System.Int64`,
+respectively. The default value of each of these is zero. Each of these has
+a non-zero value that is its own additive inverse, namely `(-2)^^(8n-1)`,
+where `n` is the number of bytes in the representation type. This is the
+minimum value of each of these types. We follow R's lead and use these
+values as the `NA` values.
+
+There are standard conversions from each signed integer type to every other
+signed integer type. There are also standard conversions from text to each
+signed integer type and from each signed integer type to each floating-point
+type.
+
+Note that we have not defined standard conversions from floating-point types
+to signed integer types.
+
+### Unsigned Integer Types
+
+The unsigned integer types, `U1`, `U2`, `U4`, and `U8`, have representation
+types `System.Byte`, `System.UInt16`, `System.UInt32`, and `System.UInt64`,
+respectively. The default value of each of these is zero. These types do not
+have an `NA` value.
+
+There are standard conversions from each unsigned integer type to every
+other unsigned integer type. There are also standard conversions from text
+to each unsigned integer type and from each unsigned integer type to each
+floating-point type.
+
+Note that we have not defined standard conversions from floating-point types
+to unsigned integer types, or between signed integer types and unsigned
+integer types.
+
+## Key Types
+
+Key types are used for data that is represented numerically, but where the
+order and/or magnitude of the values is not semantically meaningful. For
+example, hash values, social security numbers, and the index of a term in a
+dictionary are all best modeled with a key type.
+
+The representation type of a key type, also called its underlying type, must
+be one of the standard four .Net unsigned integer types. The `NA` and
+default values of a key type are the same value, namely the representational
+value zero.
+
+Key types are instances of the sealed class `KeyType`, which derives from
+`PrimitiveType`.
+
+In addition to its underlying type, a key type specifies:
+
+* A count value, between `0` and `int.MaxValue`, inclusive
+
+* A "minimum" value, between `0` and `ulong.MaxValue`, inclusive
+
+* A Boolean value indicating whether the values of the key type are
+  contiguous
+
+Regardless of the minimum and count values, the representational value zero
+always means `NA` and the representational value one is always the first
+valid value of the key type.
+
+Notes:
+
+* The `Count` property returns the count of the key type. This is of type
+  `int`, but is required to be non-negative. When `Count` is zero, the key
+  type has no known or useful maximum value. Otherwise, the legal
+  representation values are from one up to and including `Count`. The
+  `Count` is required to be representable in the underlying type, so, for
+  example, the `Count` value of a key type based on `System.Byte` must not
+  exceed `255`. As an example of the usefulness of the `Count` property,
+  consider the `KeyToVector` transform implemented as part of ML.NET. It
+  maps from a key type value to an indicator vector. The length of the
+  vector is the `Count` of the key type, which is required to be positive.
+  For a key value of `k`, with `1 ≤ k ≤ Count`, the resulting vector has a
+  value of one in the (`k-1`)th slot, and zero in all other slots. An `NA`
+  value (with representation zero) is mapped to the all-zero vector of
+  length `Count`.
+
+* For a key type with positive `Count`, a representation value should be
+  between `0` and `Count`, inclusive, with `0` meaning `NA`. When processing
+  values from an untrusted source, it is best to guard against values bigger
+  than `Count` and treat such values as equivalent to `NA`.
+
+* The `Min` property returns the minimum semantic value of the key type.
+  This is used exclusively for transforming from a representation value,
+  where the valid values start at one, to user facing values, which might
+  start at any non-negative value. The most common values for `Min` are zero
+  and one.
+
+* The boolean `Contiguous` property indicates whether values of the key type
+  are generally contiguous in the sense that a complete sampling of
+  representation values of the key type would cover most, if not all, values
+  from one up to their max. A `true` value indicates that using an array to
+  implement a map from the key type values is a reasonable choice. When
+  `false`, it is likely more prudent to use a hash table.
+
+* A key type can be non-`Contiguous` only if `Count` is zero. The converse
+  however is not true. A key type that is contiguous but has `Count` equal
+  to zero is one where there is a reasonably small maximum, but that maximum
+  is unknown.
+  In this case, an array might be a good choice for a map from
+  the key type.
+
+* The shorthand for a key type with representation type `U1`, and semantic
+  values from `1000` to `1099`, inclusive, is `U1[1000-1099]`. Note that the
+  `Min` value of this key type is outside the range of the underlying type,
+  `System.Byte`, but the `Count` value is only `100`, which is representable
+  in a `System.Byte`. Recall that the representation values always start at
+  1 and extend up to `Count`, in this case `100`.
+
+* For a key type with representation type `System.UInt32` and semantic
+  values starting at `1000`, with no known maximum, the shorthand is
+  `U4[1000-*]`.
+
+There are standard conversions from text to each key type. This conversion
+parses the text as a standard non-negative integer value and honors the
+`Min` and `Count` values of the key type. If a parsed numeric value falls
+outside the range indicated by `Min` and `Count`, or if the text is not
+parsable as a non-negative integer, the result is `NA`.
+
+There are standard conversions from one key type to another, provided:
+
+* The source and destination key types have the same `Min` and `Count`
+  values.
+
+* Either the number of bytes in the destination's underlying type is greater
+  than the number of bytes in the source's underlying type, or the `Count`
+  value is positive. In the latter case, the `Count` is necessarily less
+  than `2^^k`, where `k` is the number of bits in the destination type's
+  underlying type. For example, `U1[1-*]` can be converted to `U2[1-*]`, but
+  `U2[1-*]` cannot be converted to `U1[1-*]`. Also, `U1[1-100]` and
+  `U2[1-100]` can be converted in both directions.
+
+## Vector Types
+
+### Introduction
+
+Vector types are one of the key innovations of the IDataView system and are
+critical for high dimensional machine-learning applications.
+
+For example, when processing text, it is common to hash all or parts of the
+text and encode the resulting hash values, first as a key type, then as
+indicator or bag vectors using the `KeyToVector` transform. Using a `k`-bit
+hash produces a key type with `Count` equal to `2^^k`, and vectors of the
+same length. It is common to use `20` or more hash bits, producing vectors
+of length a million or more. The vectors are typically very sparse. In
+systems that do not support vector-valued columns, each of these million or
+more values is placed in a separate (sparse) column, leading to a massive
+explosion of the column space. Most tabular systems are not designed to
+scale to millions of columns, and the user experience also suffers when
+displaying such data. Moreover, since the vectors are very sparse, placing
+each value in its own column means that, when a row is being processed, each
+of those sparse columns must be queried or scanned for its current value.
+Effectively the sparse matrix of values has been needlessly transposed. This
+is very inefficient when there are just a few (often one) non-zero entries
+among the column values. Vector types solve these issues.
+
+A vector type is an instance of the sealed `VectorType` class, which derives
+from `ColumnType`. The vector type contains its `ItemType`, which must be a
+`PrimitiveType`, and its dimensionality information. The dimensionality
+information consists of one or more non-negative integer values. The
+`VectorSize` is the product of the dimensions. A dimension value of zero
+means that the true value of that dimension can vary from value to value.
+
+For example, tokenizing a text by splitting it into multiple terms generates
+a vector of text of varying/unknown length. The result type shorthand is
+`V<TX>`. Hashing this using `6` bits then produces the vector type
+`V<U4[0-63]>`. Applying the `KeyToVector` transform then produces the vector
+type `V<R4, 0, 64>`. Each of these vector types has a `VectorSize` of zero,
+indicating that the total number of slots varies, but the latter still has
+potentially useful dimensionality information: the vector slots are
+partitioned into an unknown number of runs of consecutive slots each of
+length `64`.
+
+As another example, consider an image data set. The data starts with a `TX`
+column containing URLs for images. Applying an `ImageLoader` transform
+generates a column of a custom (non-standard) type, `Picture<*,*,4>`, where
+the asterisks indicate that the picture dimensions are unknown. The last
+dimension of `4` indicates that there are four channels in each pixel: the
+three color components, plus the alpha channel. Applying an `ImageResizer`
+transform scales and crops the images to a specified size, for example,
+`100x100`, producing a type of `Picture<100,100,4>`. Finally, applying an
+`ImagePixelExtractor` transform (and specifying that the alpha channel
+should be dropped), produces the vector type `V<R4, 3, 100, 100>`. In this
+example, the `ImagePixelExtractor` re-organized the color information into
+separate planes, and divided each pixel value by 256 to get pixel values
+between zero and one.
+
+### Equivalence
+
+Note that two vector types are equivalent when they have equivalent item
+types and have identical dimensionality information. To test for
+compatibility, instead of equivalence, in the sense that the total
+`VectorSize` should be the same, use the `SameSizeAndItemType` method
+instead of the `Equals` method (see the `ColumnType` code below).
+
+### Representation Type
+
+The representation type of a vector type is the struct `VBuffer<T>`, where
+`T` is the representation type of the item type. For example, the
+representation type of `V<R4, 17>` is `VBuffer<Single>`. When the vector
+type's `VectorSize` is positive, each value of the type will have length
+equal to the `VectorSize`.
+
+The struct `VBuffer<T>`, sketched below, provides both dense and sparse
+representations and encourages cooperative buffer sharing. A complete
+discussion of `VBuffer<T>` and associated coding idioms is in another
+document.
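+
+As an abridged sketch (the field names are as described in the notes that
+follow; everything else about the real struct, including its constructors
+and helpers, is elided here):
+
+```csharp
+public struct VBuffer<T>
+{
+    public readonly int Length;    // logical length of the vector
+    public readonly int Count;     // number of explicitly represented items
+    public readonly T[] Values;    // item values; only the first Count matter
+    public readonly int[] Indices; // parallel to Values when sparse
+
+    // Dense iff every logical slot is explicitly represented.
+    public bool IsDense { get { return Count == Length; } }
+}
+```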
+
+Notes:
+
+* `VBuffer<T>` contains four public readonly fields: `Length`, `Count`,
+  `Values`, and `Indices`.
+
+* `Length` is the logical length of the vector, and must be non-negative.
+
+* `Count` is the number of items explicitly represented in the vector.
+  `Count` is non-negative and less than or equal to `Length`.
+
+* When `Count` is equal to `Length`, the vector is dense. Otherwise, the
+  vector is sparse.
+
+* The `Values` array contains the explicitly represented item values. The
+  length of the `Values` array is at least `Count`, but not necessarily
+  equal to `Count`. Only the first `Count` items in `Values` are part of the
+  vector; any remaining items are garbage and should be ignored. Note that
+  when `Count` is zero, `Values` may be null.
+
+* The `Indices` array is only relevant when the vector is sparse. In the
+  sparse case, `Indices` is parallel to `Values`, only the first `Count`
+  items are meaningful, the indices must be non-negative and less than
+  `Length`, and the indices must be strictly increasing. Note that when
+  `Count` is zero, `Indices` may be null. In the dense case, `Indices` is
+  not meaningful and may or may not be null.
+
+* It is very common for the arrays in a `VBuffer<T>` to be larger than
+  needed for their current value. A special case of this is when a dense
+  `VBuffer<T>` has a non-null `Indices` array. The extra items in the arrays
+  are not meaningful and should be ignored. Allowing these buffers to be
+  larger than currently needed reduces the need to reallocate buffers for
+  different values. For example, when cursoring through a vector valued
+  column with `VectorSize` of 100, client code could pre-allocate values and
+  indices arrays and seed a `VBuffer<T>` with those arrays. When fetching
+  values, the client code passes the `VBuffer<T>` by reference. The called
+  code can re-use those arrays, filling them with the current values.
+
+* Generally, vectors should use a sparse representation only when the number
+  of non-default items is at most half the value of `Length`. However, this
+  guideline is not a mandate.
+
+See the full `IDataView` technical specification for additional details on
+`VBuffer<T>`, including complete discussion of programming idioms, and
+information on helper classes for building and manipulating vectors.
+
+## Standard Conversions
+
+The `IDataView` system includes the definition and implementation of many
+standard conversions. Standard conversions are required to map source
+default values to destination default values. When both the source type and
+destination type have an `NA` value, the conversion must map `NA` to `NA`.
+When the source type has an `NA` value, but the destination type does not,
+the conversion must map `NA` to the default value of the destination type.
+
+Most standard conversions are implemented by the singleton class
+`Conversions` in the namespace `Microsoft.MachineLearning.Data.Conversion`.
+The standard conversions are exposed by the `ConvertTransform`.
+
+### From Text
+
+There are standard conversions from `TX` to the standard primitive types,
+`R4`, `R8`, `I1`, `I2`, `I4`, `I8`, `U1`, `U2`, `U4`, `U8`, and `BL`. For
+non-empty, non-missing `TX` values, these conversions use standard parsing
+of floating-point and integer values. For `BL`, the mapping is case
+insensitive, maps text values `{ true, yes, t, y, 1, +1, + }` to
+`DvBool.True`, and maps the values `{ false, no, f, n, 0, -1, - }` to
+`DvBool.False`.
+
+If parsing fails, the result is the `NA` value for floating-point, signed
+integer types, and boolean, and zero for unsigned integer types. Note that
+overflow of an integer type is considered failure of parsing, so produces an
+`NA` (or zero for unsigned). These conversions map missing/`NA` text to
+`NA`, for floating-point and signed integer types, and to zero for unsigned
+integer types.
+
+These conversions are required to map empty text (the default value of `TX`)
+to the default value of the destination, which is zero for all numeric types
+and `DvBool.False` for `BL`. This may seem unfortunate at first glance, but
+leads to some nice invariants. For example, when loading a text file with
+sparse row specifications, it's desirable for the result to be the same
+whether the row is first processed entirely as `TX` values, then parsed, or
+processed directly into numeric values, that is, parsing as the row is
+processed. In the latter case, it is simple to map implicit items
+(suppressed due to sparsity) to zero. In the former case, these items are
+first mapped to the empty text value. To get the same result, we need empty
+text to map to zero.
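+
+A minimal sketch of the `TX` to `R4` rules above, assuming (for simplicity)
+that text arrives as a `string` with `null` standing in for the missing
+value rather than as a `DvText`:
+
+```csharp
+using System.Globalization;
+
+static class TextConversions
+{
+    public static float ToR4(string text)
+    {
+        if (text == null)     // missing text maps to NA (NaN for R4)
+            return float.NaN;
+        if (text.Length == 0) // empty text (the default value) maps to zero
+            return 0f;
+        float value;          // unparsable text also produces NA
+        return float.TryParse(text, NumberStyles.Float, CultureInfo.InvariantCulture, out value)
+            ? value : float.NaN;
+    }
+}
+```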
+
+### Floating Point
+
+There are standard conversions from `R4` to `R8` and from `R8` to `R4`.
+These are the standard IEEE 754 conversions (using unbiased round-to-nearest
+in the case of `R8` to `R4`).
+
+### Signed Integer
+
+There are standard conversions from each signed integer type to each other
+signed integer type. These conversions map `NA` to `NA`, map any other
+numeric value that fits in the destination type to the corresponding value,
+and map any numeric value that does not fit in the destination type to `NA`.
+For example, when mapping from `I1` to `I2`, the source `NA` value, namely
+0x80, is mapped to the destination `NA` value, namely 0x8000, and all other
+numeric values are mapped as expected. When mapping from `I2` to `I1`, any
+value that is too large in magnitude to fit in `I1`, such as 312, is mapped
+to `NA`, namely 0x80.
+
+### Signed Integer to Floating Point
+
+There are standard conversions from each signed integer type to each
+floating-point type. These conversions map `NA` to `NA`, and map all other
+values according to the IEEE 754 specification using unbiased
+round-to-nearest.
+
+### Unsigned Integer
+
+There are standard conversions from each unsigned integer type to each other
+unsigned integer type. These conversions map any numeric value that fits in
+the destination type to the corresponding value, and map any numeric value
+that does not fit in the destination type to zero. For example, when mapping
+from `U2` to `U1`, any value that is too large in magnitude to fit in `U1`,
+such as 312, is mapped to zero.
+
+### Unsigned Integer to Floating Point
+
+There are standard conversions from each unsigned integer type to each
+floating-point type. These conversions map all values according to the IEEE
+754 specification using unbiased round-to-nearest.
+
+### Key Types
+
+There are standard conversions from one key type to another, provided:
+
+* The source and destination key types have the same `Min` and `Count`
+  values.
+
+* Either the number of bytes in the destination's underlying type is greater
+  than the number of bytes in the source's underlying type, or the `Count`
+  value is positive. In the latter case, the `Count` is necessarily less
+  than `2^^k`, where `k` is the number of bits in the destination type's
+  underlying type. For example, `U1[1-*]` can be converted to `U2[1-*]`, but
+  `U2[1-*]` cannot be converted to `U1[1-*]`. Also, `U1[1-100]` and
+  `U2[1-100]` can be converted in both directions.
+
+The conversion maps source representation values to the corresponding
+destination representation values. There are no special cases, because of
+the requirements above.
+
+### Boolean to Numeric
+
+There are standard conversions from `BL` to each of the signed integer and
+floating-point numeric types. These map `DvBool.True` to one, `DvBool.False`
+to zero, and `DvBool.NA` to the numeric type's `NA` value.
+
+## Type Classes
+
+This chapter contains information on the C# classes used to represent column
+types. Since the IDataView type system is extensible, this list describes
+only the core data types.
+
+### `ColumnType` Abstract Class
+
+The IDataView system includes the abstract class `ColumnType`. This is the
+base class for all column types. `ColumnType` has several convenience
+properties that simplify testing for common patterns. For example, the
+`IsVector` property indicates whether the `ColumnType` is an instance of
+`VectorType`. An abridged sketch of this surface follows; in the notes after
+it, the symbol `type` is a variable of type `ColumnType`.
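+
+This sketch is illustrative rather than the actual source; the bodies merely
+encode the semantics described in the notes below, with the concrete
+subclasses (described later in this chapter) overriding the virtual members.
+
+```csharp
+using System;
+
+public abstract class ColumnType
+{
+    public abstract Type RawType { get; } // the representation type
+
+    // Overridden by the concrete classes described later in this chapter:
+    // VectorType sets IsVector, NumberType sets IsNumber, KeyType sets
+    // IsKey and KeyCount, and so on.
+    public virtual bool IsVector { get { return false; } }
+    public virtual bool IsNumber { get { return false; } }
+    public virtual bool IsKey { get { return false; } }
+
+    // A key type's Count, or zero. A key type may itself have Count == 0
+    // (count unknown), so KeyCount == 0 does not imply "not a key type".
+    public virtual int KeyCount { get { return 0; } }
+
+    // For a vector type, its item type; otherwise, the type itself.
+    public virtual ColumnType ItemType { get { return this; } }
+
+    // Product of a vector type's dimensions; zero means "not a vector" or
+    // "vector of unknown/variable length".
+    public virtual int VectorSize { get { return 0; } }
+
+    public bool IsKnownSizeVector { get { return VectorSize > 0; } }
+    public int ValueCount { get { return IsVector ? VectorSize : 1; } }
+}
+```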
+
+* The `type.RawType` property indicates the representation type of the
+  column type. Its use should generally be restricted to constructing
+  generic type and method instantiations. In particular, testing whether
+  `type.RawType == typeof(int)` is not sufficient to test for the standard
+  `I4` type. The proper test is `type == NumberType.I4`, since there is a
+  single universal instance of the `I4` type.
+
+* Certain .Net types have a corresponding `DataKind` `enum` value. The value
+  of the `type.RawKind` property is consistent with `type.RawType`. For .Net
+  types that do not have a corresponding `DataKind` value, the
+  `type.RawKind` property returns zero. The `type.RawKind` property is
+  particularly useful when switching over raw type possibilities, but only
+  after testing for the broader kind of the type (key type, numeric type,
+  etc.).
+
+* The `type.IsVector` property is equivalent to `type is VectorType`.
+
+* The `type.IsNumber` property is equivalent to `type is NumberType`.
+
+* The `type.IsText` property is equivalent to `type is TextType`. There is a
+  single instance of the `TextType`, so this is also equivalent to `type ==
+  TextType.Instance`.
+
+* The `type.IsBool` property is equivalent to `type is BoolType`. There is a
+  single instance of the `BoolType`, so this is also equivalent to `type ==
+  BoolType.Instance`.
+
+* The `type.IsKey` property is equivalent to `type is KeyType`.
+
+* If `type` is a key type, then `type.KeyCount` is the same as
+  `((KeyType)type).Count`. If `type` is not a key type, then `type.KeyCount`
+  is zero. Note that a key type can have a `Count` value of zero, indicating
+  that the count is unknown, so `type.KeyCount` being zero does not imply
+  that `type` is not a key type. In summary, `type.KeyCount` is equivalent
+  to: `type is KeyType ? ((KeyType)type).Count : 0`.
+
+* The `type.ItemType` property is the item type of the vector type, if
+  `type` is a vector type, and is the same as `type` otherwise. For example,
+  to test for a type that is either `TX` or a vector of `TX`, one can use
+  `type.ItemType.IsText`.
+
+* The `type.IsKnownSizeVector` property is equivalent to `type.VectorSize >
+  0`.
+
+* The `type.VectorSize` property is zero if either `type` is not a vector
+  type or if `type` is a vector type of unknown/variable length. Otherwise,
+  it is the length of vectors belonging to the type.
+
+* The `type.ValueCount` property is one if `type` is not a vector type and
+  the same as `type.VectorSize` if `type` is a vector type.
+
+* The `Equals` method returns whether the types are semantically equivalent.
+  Note that for vector types, this requires the dimensionality information
+  to be identical.
+
+* The `SameSizeAndItemType` method is the same as `Equals` for non-vector
+  types. For vector types, it returns true iff the two types have the same
+  item type and have the same `VectorSize` values. For example, for the two
+  vector types `V<R4, 3, 2>` and `V<R4, 6>`, `Equals` returns false but
+  `SameSizeAndItemType` returns true.
+
+### `PrimitiveType` Abstract Class
+
+The `PrimitiveType` abstract class derives from `ColumnType` and is the base
+class of all primitive type implementations.
+
+### `TextType` Sealed Class
+
+The `TextType` sealed class derives from `PrimitiveType` and is a
+singleton-class for the standard text type. The instance is exposed by the
+static `TextType.Instance` property.
+
+### `BooleanType` Sealed Class
+
+The `BooleanType` sealed class derives from `PrimitiveType` and is a
+singleton-class for the standard boolean type. The instance is exposed by
+the static `BooleanType.Instance` property.
+
+### `NumberType` Sealed Class
+
+The `NumberType` sealed class derives from `PrimitiveType` and exposes
+single instances of each of the standard numeric types, `R4`, `R8`, `I1`,
+`I2`, `I4`, `I8`, `U1`, `U2`, `U4`, `U8`, and `UG`.
+
+### `DateTimeType` Sealed Class
+
+The `DateTimeType` sealed class derives from `PrimitiveType` and is a
+singleton-class for the standard datetime type. The instance is exposed by
+the static `DateTimeType.Instance` property.
+
+### `DateTimeZoneType` Sealed Class
+
+The `DateTimeZoneType` sealed class derives from `PrimitiveType` and is a
+singleton-class for the standard datetime timezone type. The instance is
+exposed by the static `DateTimeZoneType.Instance` property.
+
+### `TimeSpanType` Sealed Class
+
+The `TimeSpanType` sealed class derives from `PrimitiveType` and is a
+singleton-class for the standard timespan type. The instance is exposed by
+the static `TimeSpanType.Instance` property.
+
+### `KeyType` Sealed Class
+
+The `KeyType` sealed class derives from `PrimitiveType` and instances
+represent key types.
+
+Notes:
+
+* Two key types are considered equal iff their kind, min, count, and
+  contiguous values are the same.
+
+* The static `IsValidDataKind` method returns true iff kind is `U1`, `U2`,
+  `U4`, or `U8`. These are the only valid underlying data kinds for key
+  types.
+
+* The inherited `KeyCount` property returns the same value as the `Count`
+  property.
+
+### `VectorType` Sealed Class
+
+The `VectorType` sealed class derives from `ColumnType` and instances
+represent vector types. The item type is specified as the first parameter to
+each constructor and the dimension information is inferred from the
+additional parameters.
+
+* The `DimCount` property indicates the number of dimensions and the
+  `GetDim` method returns a particular dimension value. All dimension values
+  are non-negative integers. A dimension value of zero indicates unknown (or
+  variable) in that dimension.
+
+* The `VectorSize` property returns the product of the dimensions.
+
+* The `IsSubtypeOf(VectorType other)` method returns true if this is a
+  subtype of `other`, in the sense that they have the same item type, and
+  either have the same `VectorSize` or `other.VectorSize` is zero.
+
+* The inherited `Equals` method returns true if the two types have the same
+  item type and the same dimension information.
+
+* The inherited `SameSizeAndItemType(ColumnType other)` method returns true
+  if `other` is a vector type with the same item type and the same
+  `VectorSize` value.
diff --git a/docs/code/IdvFileFormat.md b/docs/code/IdvFileFormat.md
new file mode 100644
index 0000000000..4009eed726
--- /dev/null
+++ b/docs/code/IdvFileFormat.md
@@ -0,0 +1,191 @@
+# IDV File Format
+
+This document describes ML.NET's binary dataview file format, version
+1.1.1.5, written by the `BinarySaver` and `BinaryLoader` classes, commonly
+known as the `.idv` format.
+
+## Goal of the Format
+
+A dataview is a collection of columns, over some number of rows. (Do not
+confuse columns with features. Columns can be and often are vector valued,
+and it is expected though not required that commonly all features will be
+together in one vector valued column.)
+
+The actual values are stored in blocks.
+A block holds values for a single
+column across multiple rows. Block format is dictated by a codec. There is a
+table-of-contents and lookup table to facilitate quasi-random access to
+particular blocks. (Quasi in the sense that you can only seek to a block,
+not to a particular row within a block.)
+
+## General Data Format
+
+Before we discuss the format itself we will establish some conventions on
+how individual scalar values, strings, and other data are serialized. All
+basic pieces of data (e.g., a single number, or a single string) are encoded
+in ways reflecting the semantics of the .NET `BinaryWriter` class, those
+semantics being:
+
+* All numbers are stored as little-endian, using their natural fixed-length
+  binary encoding.
+
+* Strings are stored using an unsigned
+  [LEB128](https://en.wikipedia.org/wiki/LEB128) number describing the
+  number of bytes, followed by that many bytes containing the UTF-8 encoded
+  string.
+
+A note about this: LEB128 is a simple encoding to encode arbitrarily large
+integers. Each byte of 8-bits follows this convention. The most significant
+bit is 0 if and only if this is the end of the LEB128 encoding. The
+remaining 7 bits are a part of the number being encoded. The bytes are
+stored little-endian, that is, the first byte holds the 7 least significant
+bits, the second byte (if applicable) holds the next 7 least significant
+bits, etc., and the last byte holds the 7 most significant bits. LEB128 is
+used in one or two places in this format. (I might tend to prefer use of
+LEB128 in places where we are writing values that, on balance, we expect to
+be relatively small, and only in cases where there is no potential benefit
+from random access to the associated stream, since LEB128 is incompatible
+with random access. However, this is not formulated into anything
+approaching a definite policy.)
+
+## Header
+
+Every binary instance stream has a header composed of 256 bytes, at the
+start of the stream. Not all bytes are used. Those bytes that are not
+explicitly used have undefined content, and can have anything in them. We
+strongly encourage writers of this format to insert obscene messages in this
+dead space. The content is defined as follows (the offsets being where each
+field starts).
+
+Offsets | Type | Name and Description
+--------|-------|---------------------
+0 | ulong | **Signature**: The magic number of this file.
+8 | ulong | **Version**: Indicates the version of the data file.
+16 | ulong | **CompatibleVersion**: Indicates the minimum reader version that can interpret this file, possibly with some data loss.
+24 | long | **TableOfContentsOffset**: The offset to the column table of contents structure.
+32 | long | **TailOffset**: The eight-byte tail signature starts at this offset. So, the entire dataset stream should be considered to have byte length of eight plus this value.
+40 | long | **RowCount**: The number of rows in this data file.
+48 | int | **ColumnCount**: The number of columns in this data file.
+
+Notes on these:
+
+* The signature of this file is `0x00425644004C4D43`, which is, when written
+  little-endian to a file, `CML DVB ` with null characters in the place of
+  spaces. These letters are intended to suggest "CloudML DataView Binary."
+
+* The tail signature is the byte-reversed version of this, that is,
+  `0x434D4C0044564200`.
+
+* Versions are encoded as four 16-bit unsigned numbers packed into a single
+  ulong, with higher order bits being a more major version. The first
+  supported version of the format is 1.1.1.4, that is,
+  `0x0001000100010004`. (Versions prior to 1.1.1.4 did exist, but were not
+  released, so we do not support them, though we do describe them in this
+  document for the sake of completeness.)
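+
+A hedged sketch of that version packing (the four field names are labels
+invented for the example):
+
+```csharp
+static class IdvVersion
+{
+    // Four 16-bit fields packed into a ulong, higher order fields being
+    // more major; e.g. version 1.1.1.4 packs to 0x0001000100010004.
+    public static ulong Pack(ushort major, ushort minor, ushort build, ushort revision)
+    {
+        return ((ulong)major << 48) | ((ulong)minor << 32)
+             | ((ulong)build << 16) | revision;
+    }
+}
+```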
+
+## Table of Contents Format
+
+The table of contents are packed entries, with there being as many entries
+as there are columns. The version field here indicates the versions where
+that entry is written. ≥ indicates the field occurred in versions after and
+including that version, = indicates the field occurs only in that version.
+
+Description | Entry Type | Version
+------------|------------|--------
+Column name | string | ≥1.1.1.1
+Codec loadname | string | ≥1.1.1.1
+Codec parameterization length | LEB128 integer | ≥1.1.1.1
+Codec parameterization, which must have precisely the length indicated above | arbitrary, but with specified length | ≥1.1.1.1
+Compression kind | CompressionKind (byte) | ≥1.1.1.1
+Rows per block in this column | LEB128 integer | ≥1.1.1.1
+Lookup table offset | long | ≥1.1.1.1
+Slot names offset, or 0 if this column has no slot names (for version 1.1.1.2, behave as if there are no slot names, with this having value 0) | long | =1.1.1.3
+Slot names byte size (present only if slot names offset is greater than 0) | long | =1.1.1.3
+Slot names count (present only if slot names offset is greater than 0) | int | =1.1.1.3
+Metadata table of contents offset, or 0 if there is no metadata (1.1.1.4) | long | ≥1.1.1.4
+
+For those working in the ML.NET codebase: The three `Codec` fields are
+handled by the `CodecFactory.WriteCodec/TryReadCodec` methods, with the
+definition stream being at the start of the codec loadname on entry, and at
+the end of the codec parameterization on exit, in both the success and
+failure cases.
+
+The `CompressionKind` enum is described below, and describes the compression
+algorithm used to compress blocks.
+
+### Compression Kind
+
+The enum for compression kind is one byte, and follows this scheme:
+
+Compression Kind | Code
+---------------------------------------------------------------|-----
+None | 0
+DEFLATE (i.e., [RFC1951](http://www.ietf.org/rfc/rfc1951.txt)) | 1
+zlib (i.e., [RFC1950](http://www.ietf.org/rfc/rfc1950.txt)) | 2
+
+None means no compression. DEFLATE is the default scheme. There is a
+tendency to conflate zlib and DEFLATE, so to be clear: zlib can be (somewhat
+inexactly) considered a wrapped version of DEFLATE, but it is still a
+distinct (but closely related) format. However, both are implemented by the
+zlib library, which is probably the source of the confusion.
+
+## Metadata Table of Contents Format
+
+The metadata table of contents begins with a LEB128 integer describing the
+number of entries. (Should be a positive value, since if a column has no
+metadata the expectation is that the offset for the metadata TOC will be
+stored as 0.) What follows that are that many packed entries. Each entry is
+somewhat akin to the column table of contents entry, with some
+simplifications considering that there will be exactly one "block" with one
+item.
+
+Description | Entry Type
+-------------------------------------------------------|------------
+Metadata kind | string
+Codec loadname | string
+Codec parameterization length | LEB128 integer
+Codec parameterization, which must have precisely the length indicated above | arbitrary, but with specified length
+Compression kind | CompressionKind (byte)
+Offset of the block where the metadata item is written | long
+Byte length of the block | LEB128 integer
+
+The "block" written is written in exactly the same format as the main
+content blocks. This will be very slightly inefficient as that scheme is
+written to accommodate many entries, but I don't expect that to be much of a
+burden.
+
+## Lookup Table Format
+
+Each table of contents entry is associated with a lookup table starting at
+the indicated lookup table offset. It is written as packed binary, with each
+lookup entry consisting of 16 bytes. So in all, the lookup table takes 16
+bytes, times the total number of blocks for this column.
+
+Description | Entry Type
+----------------------------------------------------------|-----------
+Block offset, position in the file where the block starts | long
+Block length, its size in bytes in the file | int
+Uncompressed block length, its size in bytes if the block bytes were decompressed according to the column's compression codec | int
+
+## Slot Names
+
+If slot names are stored, they are stored as integer index/string pairs. As
+many pairs are stored as the count of slot names present in the table of
+contents entry. Note that this only appeared in version 1.1.1.3. With
+1.1.1.4 and later, slot names were just considered yet another piece of
+metadata.
+
+Description | Entry Type
+------------------|-----------
+Index of the slot | int
+The slot name | string
+
+## Block Format
+
+Columns are ordered into blocks, with each block holding the binary encoded
+values for one particular column across a range of rows. So for example, if
+the column's table of contents describes it as having 1000 rows per block,
+the first block will contain the values for the column for rows 0 through
+999, the second block rows 1000 through 1999, etc., with all blocks
+containing the same number of rows, except the last block, which will
+contain fewer items (unless the number of rows just so happens to be a
+multiple of the block size).
+
+Each block is a possibly compressed sequence of bytes, compressed according
+to the compression type field in the table of contents. It begins and ends
+at the offsets indicated in the corresponding lookup table entry (or, for
+metadata blocks, in the metadata table of contents entry). The uncompressed
+bytes will be stored in the format as described by the codec.
diff --git a/docs/code/KeyValues.md b/docs/code/KeyValues.md
new file mode 100644
index 0000000000..ced135761d
--- /dev/null
+++ b/docs/code/KeyValues.md
@@ -0,0 +1,149 @@
+# Key Values
+
+Most commonly, key-values are used to encode items where it is convenient or
+efficient to represent values using numbers, but you want to maintain the
+logical "idea" that these numbers are keys indexing some underlying,
+implicit set of values, in a way more explicit than simply mapping to a
+number would allow you to do.
+
+A more formal description of key values and types is
+[here](IDataViewTypeSystem.md#key-types). *This* document's motivation is
+less to describe what key types and values are, and more to instead describe
+why key types are necessary and helpful things to have. Necessarily, this
+document is more anecdotal in its descriptions, to motivate its content.
Let's take a few examples of transforms that produce keys:

* The `TermTransform` forms a dictionary of unique observed values to a key. The key type's count indicates the number of items in the set, and through the `KeyValues` metadata "remembers" what each key represents.

* The `HashTransform` performs a hash of input values, and produces a key value with count equal to the range of the hash function, which, if a b-bit hash is used, will be 2ᵇ.

* The `CharTokenizeTransform` takes input strings and produces key values representing the characters observed in the string.

## Keys as Intermediate Values

Explicitly invoking transforms that produce key values, and using those key values, is sometimes helpful. However, given that most trainers expect the feature vector to be a vector of floating point values and *not* keys, in typical usage keys mostly serve as some sort of intermediate value on the way to that final feature vector. (Unless, say, one is doing something like preparing labels for a multiclass learner.)

So why not go directly to the feature vector, and forget this key stuff? Actually, to take text as the canonical example, we used to. However, by structuring the transforms from, say, text to key to vector, rather than text to vector *directly*, we are able to simplify a lot of code on the implementation side, which is both less code for us to maintain, and also gives users consistency in behavior.

So, for example, the `CharTokenize` above might appear to be a strange choice: *why* represent characters as keys? The reason is that the n-gram transform is written to ingest keys, not text, and so we can use the same transform for both the n-gram featurization of words, as well as n-char grams.

Now, much of this complexity is hidden from the user: most users will just use the `text` transform, select some options for n-grams and char-grams, and not be aware of these internal invisible keys. Similarly, a user might use the categorical or categorical hash transforms, without knowing that internally each is just the term or hash transform followed by a `KeyToVector` transform. But keys are still there, and it would be impossible to really understand ML.NET's featurization pipeline without understanding keys. Any user who wants to understand how, say, the text transform resulted in a particular featurization will have to inspect the key values to get that understanding.

## Keys are not Numbers

As an actual CLR data type, key values are stored as some form of unsigned integer (most commonly `uint`). The most common confusion that arises from this is to ascribe too much importance to the fact that it is a `uint`, and think these are somehow just numbers. This is incorrect.

For keys, the concept of order and difference has no inherent, real meaning as it does for numbers, or at least, the meaning is different and highly domain dependent. Consider a numeric `U4` type with values `0`, `1`, and `2`. The difference between `0` and `1` is `1`, and the difference between `1` and `2` is `1`, because they're numbers. Very well: now consider that you train a term transform over the input tokens `apple`, `pear`, and `orange`: this will also map to the keys logically represented as the numbers `0`, `1`, and `2` respectively. Yet for a key, is the difference between keys `0` and `1`, `1`? No: the difference is that `0` maps to `apple` and `1` to `pear`.
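To make this concrete, here is a minimal sketch of a term-map-like dictionary. This is illustrative only, not ML.NET's actual `TermTransform` code, and the variable names are hypothetical:

```csharp
using System.Collections.Generic;

// Builds a toy "term map" over observed tokens: each unique value is
// assigned the next key, in order of first appearance.
var terms = new[] { "apple", "pear", "orange", "pear" };
var map = new Dictionary<string, uint>();
foreach (var term in terms)
{
    if (!map.ContainsKey(term))
        map[term] = (uint)map.Count; // logical keys 0, 1, 2, ...
}

// map["pear"] - map["apple"] == 1, but that "1" is meaningless: the keys
// are indices into the set { apple, pear, orange }, not quantities.
```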
Order, similarly, doesn't mean one key is somehow "larger": it just means we saw one before another -- or something else entirely, if sorting by value happened to be selected.

Also: ML.NET's vectors can be sparse. Implicit entries in a sparse vector are assumed to have the `default` value for that type -- that is, implicit values for numeric types will be zero. But what would the implicit default value for a key value be? Take the `apple`, `pear`, and `orange` example above: it would be inappropriate for the default value to be `0`, because that would mean the implicit value is `apple`, and there is nothing about `apple` that makes it an appropriate default. The only really appropriate "default" choice is that the value is unknown, that is, missing.

An implication of this is that there is a distinction between the logical value of a key-value, and the actual physical value of the value in the underlying type. This will be covered more later.

## As an Enumeration of a Set: `KeyValues` Metadata

While keys can be used for many purposes, they are often used to enumerate items from some underlying set. In order to map keys back to this original set, many transforms producing key values will also produce `KeyValues` metadata associated with that output column.

Valid `KeyValues` metadata is a vector of length equal to the count of the type of the column. This can be of varying types: it is often text, but does not need to be. For example, a `term` transform applied to a column would have `KeyValues` metadata of item type equal to the item type of the input data.

How this metadata is used downstream depends on the purposes of whoever is consuming it, but common uses are: in multiclass classification, determining the human readable class names, or, if used in featurization, determining the names of the features.

Note that `KeyValues` metadata is optional, and sometimes is not even sensible. For example, consider a clustering algorithm: the prediction of the cluster of an example would be a key value. So, for example, if there were five clusters, then the prediction would indicate the cluster by a value of type `U4<0-4>`. Yet these clusters were found by the algorithm itself, and they have no natural descriptions.

## Actual Implementation

This may be of use only to writers or extenders of ML.NET, or users of our API. How key values are presented *logically* to users of ML.NET is distinct from how they are actually stored *physically* in actual memory, both in ML.NET source and through the API. For key values:

* All key values are stored as unsigned integers.
* The missing key value is always stored as `0`. See the note above about the default value, to see why this must be so.
* Valid non-missing key values are stored from `1` onwards, irrespective of whatever we claim in the key type the minimum value is.

So when, in the prior example, the term transform maps `apple`, `pear`, and `orange` seemingly to `0`, `1`, and `2`, values of `U4<0-2>`, in reality, if you were to fire up the debugger you would see that they are stored as `1`, `2`, and `3`, with unrecognized values being mapped to the "default" missing value of `0`.

Nevertheless, we almost never talk about this, no more than we would talk about our "strings" really being implemented as string slices: this is purely an implementation detail, relevant only to people working with key values at the source level. To a regular non-API user of ML.NET, key values appear *externally* to be simply values, just as strings appear to be simply strings, and so forth.
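As a sketch of the physical scheme just described (hypothetical helper names, not an ML.NET API):

```csharp
// Physical encoding of key values: logical key i is stored as i + 1, and
// physical 0 is reserved for the missing value.
static uint ToPhysical(uint logicalKey) => checked(logicalKey + 1);

static bool TryGetLogical(uint physical, out uint logicalKey)
{
    // Physical 0 is "missing"; any other value recovers the logical key.
    logicalKey = physical - 1;
    return physical != 0;
}
```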
There is another implication: a hypothetical type `U1<4000-4002>` is actually a sensible type in this scheme. The `U1` indicates that it is stored in one byte, which would on first glance seem to conflict with values like `4000`, but remember that the first valid key-value is stored as `1`, and we've identified the valid range as spanning the three values 4000 through 4002. That is, `4000` would be represented physically as `1`.

The reality cannot be seen by any conventional means I am aware of, save for viewing ML.NET's workings in the debugger or using the API and inspecting these raw values yourself: that `4000` you would see is really stored as the `byte` `1`, `4001` as `2`, `4002` as `3`, and a missing value stored as `0`.
\ No newline at end of file
diff --git a/docs/code/VBufferCareFeeding.md b/docs/code/VBufferCareFeeding.md
new file mode 100644
index 0000000000..1de7239dc6
--- /dev/null
+++ b/docs/code/VBufferCareFeeding.md
@@ -0,0 +1,270 @@
# `VBuffer` Care and Feeding

The `VBuffer<T>` is ML.NET's central vector type, used throughout our data pipeline and many other places to represent vectors of values. For example, nearly all trainers accept feature vectors as `VBuffer<float>`.

## Technical `VBuffer`s

A `VBuffer<T>` is a generic type that supports both dense and sparse vectors over items of type `T`. This is the representation type for all [`VectorType`](IDataViewTypeSystem.md#vector-representations) instances in the `IDataView` ecosystem. When an instance of this is passed to a row cursor getter, the callee is free to take ownership of and re-use the arrays (`Values` and `Indices`).

A `VBuffer<T>` is a struct, and has the following `readonly` fields:

* `int Length`: The logical length of the buffer.

* `int Count`: The number of items explicitly represented. This equals `Length` when the representation is dense and is less than `Length` when sparse.

* `T[] Values`: The values. Only the first `Count` of these are valid.

* `int[] Indices`: The indices. For a dense representation, this array is not used, and may be `null`. For a sparse representation it is parallel to `Values` and specifies the logical indices for the corresponding values. Only the first `Count` of these are valid.

`Values` must have length equal to at least `Count`. If the representation is sparse, that is, `Count < Length`, then `Indices` must also have length greater than or equal to `Count`. If `Count == 0`, then it is entirely legal for `Values` or `Indices` to be `null`, and if dense then `Indices` can always be `null`.

On the subject of `Count == 0`, note that having no valid values in `Indices` and `Values` merely means that no values are explicitly defined, and the vector should be treated, logically, as being filled with `default(T)`.

For sparse vectors, `Indices` must have length equal to at least `Count`, and the first `Count` indices must be increasing, with all indices between `0` inclusive and `Length` exclusive.

Regarding the generic type parameter `T`, the only real assumption made about this type is that assignment (that is, using `=`) is sufficient to create an *independent* copy of that item. All representation types of the [primitive types](IDataViewTypeSystem.md#standard-column-types) have this property (e.g., `DvText`, `DvInt4`, `Single`, `Double`, etc.), but, for example, `VBuffer<>` itself does not have this property. So, no `VBuffer` of `VBuffer`s for you.
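The invariants above can be captured in a small validation sketch. This is illustrative code over the fields just described, not part of ML.NET's API:

```csharp
// Checks the structural invariants of a VBuffer-like (length, count,
// values, indices) quadruple, as described in the text above.
static bool IsValid<T>(int length, int count, T[] values, int[] indices)
{
    if (count < 0 || count > length)
        return false;
    if (count > 0 && (values == null || values.Length < count))
        return false;
    if (count < length) // sparse: indices must be valid and increasing
    {
        if (count > 0 && (indices == null || indices.Length < count))
            return false;
        for (int i = 0; i < count; ++i)
        {
            if (indices[i] < 0 || indices[i] >= length)
                return false;
            if (i > 0 && indices[i] <= indices[i - 1])
                return false;
        }
    }
    return true;
}
```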
## Sparse Values as `default(T)`

Any implicit value in a sparse `VBuffer<T>` **must** logically be treated as though it has value `default(T)`. For example, suppose we have the following two declarations:

```csharp
var a = new VBuffer<float>(5, new float[] { 0, 1, 0, 0, 2 });
var b = new VBuffer<float>(5, 2, new float[] { 1, 2 }, new int[] { 1, 4 });
```

Here, `a` is dense, and `b` is sparse. However, any operations over either must treat the logical indices `0`, `2`, and `3` as if they have value `0.0f`. The two should be equivalent!

ML.NET throughout its codebase assumes in many places that sparse and dense representations are interchangeable: if it is more efficient to consider something sparse or dense, the code will have no qualms about making that conversion. This does mean, though, that we depend upon all code that deals with `VBuffer`s responding in the same fashion, and respecting this convention.

As a corollary to the above note about equivalence of sparse and dense representations: since they are equivalent, it follows that any code consuming `VBuffer`s must work equally well with *both*. That is, there must never be a condition where data is read and assumed to be either sparse or dense, since implementers of `IDataView` and related interfaces are perfectly free to produce either.

The only "exception" to this rule is a necessary acknowledgment of the reality of floating point mathematics: due to the way the JIT will optimize one code path or another, and due to the fact that floating point math is not associative, operations over sparse `VBuffer<float>` or `VBuffer<double>` vectors can sometimes yield modestly different results than the "same" operation over dense values.

## Why Buffer Reuse

The question is often asked by people new to this codebase: why bother with buffer reuse at all? Without going into too many details, we used to not, and suffered for it. We had a far simpler system where examples were yielded through an [`IEnumerable<>`](https://msdn.microsoft.com/en-us/library/9eekhta0.aspx), and our vector type at the time had `Indices` and `Values` arrays as well, but their sizes were their actual sizes, and, being returned through an `IEnumerable<>`, there was no plausible way to "recycle" the buffers.

Also: who "owned" a fetched example (the caller, or callee) was not clear. Because it was not clear, code was inevitably written and checked in that made *either* assumption, which meant, ultimately, that everything that touched these would duplicate everything by default, because doing anything else would fail in some case.

The reason why this becomes important is that [garbage collection](https://msdn.microsoft.com/en-us/library/0xy59wtx.aspx) in the .NET framework is not free. Creating and destroying these arrays *can* be cheap, provided that they are sufficiently small, short lived, and only ever exist in a single thread. But violate any of these, and there is a possibility these arrays could be allocated on the large object heap, or promoted to gen-2 collection. The results could be disastrous: in one particularly memorable incident regarding neural net training, the move to `IDataView` and its `VBuffer`s resulted in a more than tenfold improvement in runtime performance, because under the old regime the garbage collection of the feature vectors was just taking so much time.

This is somewhat unfortunate: a joke-that's-not-really-a-joke on the team was that we were writing C# as though it were C code.
Be that as it may, buffer reuse is essential to our performance, especially on larger problems.

This design requirement of buffer reuse has deeper implications for the ecosystem than merely this type. For example, it is one crucial reason why so many value accessors in the `IDataView` ecosystem fill in values passed in through a `ref` parameter, rather than, say, being a return value.

## Buffer Re-use as a User

Let's imagine we have an `IDataView` in a variable `dataview`, and we just so happen to know that the column with index 5 has representation type `VBuffer<float>`. (In real code, this would presumably be achieved through more complicated code involving an inspection of `dataview.Schema`, but we omit such details here.)

```csharp
using (IRowCursor cursor = dataview.GetRowCursor(col => col == 5))
{
    ValueGetter<VBuffer<float>> getter = cursor.GetGetter<VBuffer<float>>(5);
    var value = default(VBuffer<float>);
    while (cursor.MoveNext())
    {
        getter(ref value);
        // Presumably something else is done with value.
    }
}
```

In this example, we open a cursor (telling it to make only column 5 active), then get the "getter" over this column. What enables buffer re-use is that, as we go row by row over the data with the `while` loop, we pass the same `value` variable in to the `getter` delegate, again and again. Presumably the first time, or several, memory is allocated. Initially `value = default(VBuffer<float>)`, that is, it has zero `Length` and `Count` and `null` `Indices` and `Values`. Presumably at some point, probably the first call, `value` is replaced with a `VBuffer<float>` that has actual values allocated. In subsequent calls, perhaps these are judged insufficiently large, and new arrays are allocated, but we would expect the arrays to become "large enough" at some point to accommodate many values, so reallocations would become increasingly rare.

A common mistake made by first time users is to do something like move the `var value` declaration inside the `while` loop, thus dooming `getter` to allocate the arrays every single time, completely defeating the purpose of buffer reuse.

## Buffer Re-use as a Developer

Nearly all methods in ML.NET that "return" a `VBuffer<T>` do not really return a `VBuffer<T>` *at all*, but instead have a parameter `ref VBuffer<T> dst`, where they are expected to put the result. See the above example, with the `getter`. A `ValueGetter` is defined:

```csharp
public delegate void ValueGetter<TValue>(ref TValue value);
```

Let's describe the typical practice of "returning" a `VBuffer` in, say, a `ref` parameter named `dst`: if `dst.Indices` and `dst.Values` are sufficiently large to contain the result, they are used, and the value is calculated, or sometimes copied, into them. If either is insufficiently large, then a new array is allocated in its place. After all the calculation happens, a *new* `VBuffer` is constructed and assigned to `dst` (possibly, if they were large enough, using the same `Indices` and `Values` arrays as were passed in, albeit with different values).

`VBuffer`s can be either sparse or dense. However, even when returning a dense `VBuffer`, you would not discard the `Indices` array of the passed-in buffer, assuming there was one. The `Indices` array was merely larger than necessary to store *this* result: that you happened not to need it this call does not justify throwing it away. We don't care about buffer re-use just for a single call, after all!
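A minimal sketch of this pattern, assuming the `VBuffer<T>` fields and constructors described in this document (this is not a verbatim copy of any ML.NET method):

```csharp
// "Returns" a copy of src through ref dst, re-using dst's arrays when
// they are large enough, per the convention described above.
static void CopyTo<T>(ref VBuffer<T> src, ref VBuffer<T> dst)
{
    var values = dst.Values;
    var indices = dst.Indices;
    if (src.Count > 0)
    {
        if (values == null || values.Length < src.Count)
            values = new T[src.Count];
        Array.Copy(src.Values, values, src.Count);
    }
    if (src.Count < src.Length)
    {
        // Sparse: the indices are meaningful and must be copied too.
        if (indices == null || indices.Length < src.Count)
            indices = new int[src.Count];
        Array.Copy(src.Indices, indices, src.Count);
        dst = new VBuffer<T>(src.Length, src.Count, values, indices);
    }
    else
    {
        // Dense: indices are unused this call, but hand the array back
        // through the dense constructor so it stays available for re-use.
        dst = new VBuffer<T>(src.Length, values, indices);
    }
}
```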
Indeed, the dense constructor for the `VBuffer` accepts an `Indices` array for precisely this reason!

Also note: when you return a `VBuffer` in this fashion, the caller is assumed to *own* it at that point. This means they can do whatever they like to it, like pass the same variable into some other getter, or modify its values. Indeed, this is quite common: normalizers in ML.NET get values from their source, then immediately scale the contents of `Values` appropriately. This would hardly be possible if the callee were considered to have some stake in that result.

There is a corollary on this point: because the caller owns any `VBuffer`, you shouldn't do anything that irrevocably destroys its usefulness to the caller. For example, consider this method, which takes a vector `src` and stores the scaled result in `dst`:

```csharp
VectorUtils.ScaleBy(ref VBuffer<float> src, ref VBuffer<float> dst, float c)
```

What this does is copy the values from `src` to `dst`, while scaling each value seen by `c`.

One possible alternate (wrong) implementation would be to just say `dst = src`, then scale all contents of `dst.Values` by `c`. But then `dst` and `src` would share references to their internal arrays, completely compromising the caller's ability to do anything useful with them: if the caller were to pass `dst` into some other method that modified it, this could easily (silently!) modify the contents of `src`. The point is: if you are writing code *anywhere* whose end result is that two distinct `VBuffer` structs share references to their internal arrays, you've almost certainly introduced a **nasty** pernicious bug for your users.

## Utilities for Working with `VBuffer`s

ML.NET's runtime code has a number of utilities for operating over `VBuffer`s that we have written to be generally useful. We will not treat them in detail here, but:

* `Microsoft.ML.Runtime.Data.VBuffer<T>` itself contains a few methods for accessing and iterating over its values.

* `Microsoft.ML.Runtime.Internal.Utilities.VBufferUtils` contains utilities mainly for non-numeric manipulation of `VBuffer`s.

* `Microsoft.ML.Runtime.Numeric.VectorUtils` contains math operations over `VBuffer<float>` and `float[]`, like computing norms, dot-products, and whatnot.

* `Microsoft.ML.Runtime.Data.BufferBuilder<T>` is an abstract class whose concrete implementations are used throughout ML.NET to build up `VBuffer<T>` instances. Note that if one *can* simply build a `VBuffer` oneself easily, and does not need the niceties provided by the buffer builder, one should probably just do so.

* `Microsoft.ML.Runtime.Internal.Utilities.Utils.EnsureSize` is often useful to ensure that the arrays are of the right size.

## Golden Rules

Here are some golden rules to remember:

Remember the conditions under which `Indices` and `Values` can be `null`! A developer forgetting that `null` values for these fields are legal is probably the most common error in our code. (And, unfortunately, one that sometimes takes a while to pop up: most users don't feed empty inputs to our trainers.)

In terms of accessing anything in `Values` or `Indices`, remember: treat `Count` as the real length of these arrays, not their actual length.
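These two rules in action, as a minimal sketch (illustrative, not an ML.NET utility):

```csharp
// Sums a VBuffer<float>, treating Count as the real length. When
// Count == 0, the loop body never runs, so a null Values array is safe.
static float Sum(ref VBuffer<float> buffer)
{
    float sum = 0;
    for (int i = 0; i < buffer.Count; ++i)
        sum += buffer.Values[i];
    // Implicit entries of a sparse vector are zero, so they contribute
    // nothing: the same code serves dense and sparse representations.
    return sum;
}
```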
If you write code that results in two distinct `VBuffer`s sharing references to their internal arrays (e.g., there are two `VBuffer`s `a` and `b`, with `a.Indices == b.Indices` and `a.Indices != null`, or `a.Values == b.Values` and `a.Values != null`), then you've almost certainly done something wrong.

Structure your code so that `VBuffer`s have their buffers re-used as much as possible. If you have code called repeatedly where you are passing in some `default(VBuffer<T>)`, there's almost certainly an opportunity there.

When re-using a `VBuffer` that's been passed to you, remember that even when constructing a dense vector, you should still re-use the `Indices` array that was passed in.
\ No newline at end of file
diff --git a/docs/release-notes/0.1/release-0.1.md b/docs/release-notes/0.1/release-0.1.md
index def4723a31..a36055527a 100644
--- a/docs/release-notes/0.1/release-0.1.md
+++ b/docs/release-notes/0.1/release-0.1.md
@@ -13,7 +13,7 @@ dotnet add package Microsoft.ML
 From package manager:
 ```
-Install-Package Microsoft.ML
+Install-Package Microsoft.ML
 ```
 Or from within Visual Studio's NuGet package manager.
diff --git a/run.cmd b/run.cmd
index be63160b5e..616c7cc018 100644
--- a/run.cmd
+++ b/run.cmd
@@ -11,7 +11,7 @@ set DOTNET_SKIP_FIRST_TIME_EXPERIENCE=1
 set DOTNET_MULTILEVEL_LOOKUP=0
 :: Restore the Tools directory
-call %~dp0init-tools.cmd
+call "%~dp0init-tools.cmd"
 if NOT [%ERRORLEVEL%]==[0] exit /b 1
 set _toolRuntime=%~dp0Tools
@@ -21,8 +21,8 @@ set _json=%~dp0config.json
 :: run.exe depends on running in the root directory, notably because the config.json specifies
 :: a relative path to the binclash logger
-pushd %~dp0
-call %_dotnet% %_toolRuntime%\run.exe "%_json%" %*
+pushd "%~dp0"
+call "%_dotnet%" "%_toolRuntime%\run.exe" "%_json%" %*
 popd
 exit /b %ERRORLEVEL%
\ No newline at end of file
diff --git a/src/Microsoft.ML.Core/Data/ICursor.md b/src/Microsoft.ML.Core/Data/ICursor.md
new file mode 100644
index 0000000000..403107acc6
--- /dev/null
+++ b/src/Microsoft.ML.Core/Data/ICursor.md
@@ -0,0 +1,174 @@
# `ICursor` Notes

This document includes some more in-depth notes on some expert topics for `ICursor` implementations.

## `Batch`

Some cursorable implementations, like `IDataView`, can, through `GetRowCursorSet`, return a set of parallel cursors that partition the sequence of rows that would normally have been returned through a plain old `GetRowCursor`, just sharded into multiple cursors. These cursors can be accessed across multiple threads to enable parallel evaluation of a data pipeline. This is key for data pipeline performance.

However, even though the data pipeline can perform this parallel evaluation, at the end of this parallelization we usually ultimately want to recombine the separate threads' streams back into a single stream. This is accomplished through `Batch`.

So, to review what actually happens in ML.NET code: multiple cursors are returned through a method like `IDataView.GetRowCursorSet`. Operations can happen on top of these cursors -- most commonly, transforms creating new cursors on top of them -- and the `IRowCursorConsolidator` implementation will utilize this `Batch` field to "reconcile" the multiple cursors back down into one cursor.

It may help to first understand this process intuitively, to understand `Batch`'s requirements: when we reconcile the outputs of multiple cursors, the consolidator will take the set of cursors, and it will find the one with the "lowest" `Batch` ID.
(This must be uniquely determined: no two cursors in a set should ever return the same `Batch` value.) It will iterate on that cursor until the `Batch` ID changes, whereupon the consolidator will find the cursor with the next lowest batch ID (which should, of course, be greater than the `Batch` value we were just iterating on).

Put another way: suppose we call `GetRowCursor` (possibly with an `IRandom` instance) and store all the values from the rows of that cursoring in some list, in order. Now imagine we call `GetRowCursorSet` (with an identically constructed `IRandom` instance), and store the values from the rows of all of those cursorings in a different list, in order, accompanied by their `Batch` value. Then, if we were to perform a *stable* sort on the second list keyed by the stored `Batch` value, it should have content identical to the first list.

So: `Batch` is a `long` value associated with every `ICounted` implementation (including implementations of `ICursor`). This quantity must be non-decreasing as we call `MoveNext` or `MoveMany`: it is fine for the `Batch` to repeat the same batch value within the same cursor (though not across cursors from the same set), but any change in the value must be an increase.

The requirement of consistency is for one cursor, or the cursors from a *single* call to `GetRowCursor` or `GetRowCursorSet`. It is not required that the `Batch` be consistent among multiple independent cursorings.

## `MoveNext` and `MoveMany`

Once `MoveNext` or `MoveMany` returns `false`, all subsequent calls to either of these two methods should return `false`. It is important that they not throw, return `true`, or have any other behavior.

## `GetIdGetter`

This section treats the requirements of a proper `GetIdGetter` implementation.

It is common for objects to serve multiple `ICounted` instances to iterate over what is supposed to be the same data; e.g., in an `IDataView` a cursor set will produce the same data as a serial cursor, just partitioned, and a shuffled cursor will produce the same data as a serial cursor or any other shuffled cursor, only shuffled. The ID exists for applications that need to reconcile which entry is actually which. Ideally this ID should be unique, but for practical reasons it suffices if collisions are simply extremely improbable.

To be specific, the original case motivating this functionality was SDCA, where it is simultaneously important that we see data in a "random enough" fashion (so, shuffled), and that each instance has an associated dual variable. The ID is used to associate each instance with its corresponding dual variable across multiple iterations over the data. (Note that in this specific application it is sufficient for collisions to be merely improbable, since even a hypothetical collision would probably not materially affect the results anyway, though I'm making that claim without justification.)

Note that this ID, while it must be consistent for multiple streams according to the semantics above, is not considered part of the data per se. So, to take the example of a data view specifically, a single data view must render consistent IDs across all cursorings, but there is no suggestion at all that if the "same" data were presented in a different data view (as by, say, being transformed, cached, saved, or whatever), the IDs between the two different data views would have any discernible relationship.
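As a sketch of the SDCA-style use just described: per-row state can be keyed by ID across multiple cursorings. This assumes the `ICursor`/`GetIdGetter` API discussed in this document, with `data`, `rand`, and `numIterations` as hypothetical names in scope:

```csharp
var duals = new Dictionary<UInt128, float>();
// Multiple passes over the data, possibly shuffled differently each pass.
for (int iter = 0; iter < numIterations; ++iter)
{
    using (IRowCursor cursor = data.GetRowCursor(col => false, rand))
    {
        ValueGetter<UInt128> idGetter = cursor.GetIdGetter();
        UInt128 id = default(UInt128);
        while (cursor.MoveNext())
        {
            idGetter(ref id);
            duals.TryGetValue(id, out float dual);
            // ... compute an updated dual variable for this row ...
            duals[id] = dual;
        }
    }
}
```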
Since this ID is practically often derived from the IDs of some other `ICounted` (e.g., for a transform, the IDs of the output are usually derived from the IDs of the input), it is not only necessary to claim that the IDs generated here are probabilistically unique, but also to describe a procedure, or set of guidelines, that implementors of this method should follow in order to ensure that downstream components have a fair shake at producing unique IDs themselves.

Making duplicate IDs improbable is practically accomplished with a hashing-derived mechanism. For this we have the `UInt128` methods `Fork`, `Next`, and `Combine`. See their documentation for specifics, but they all have in common that they treat the `UInt128` as some sort of intermediate hash state, then return a new hash state based on hashing a block of additional bits. (Since the bits hashed may be fixed, depending on the operation, this can be very efficient.) The basic assumption underlying all of this is that hashes of two different hash states over the same data, or of the same hash state over different data, are unlikely to collide. Note that this is also the reason why `UInt128` was introduced: collisions become likely when the number of elements is on the order of the square root of the hash space. The square root of `UInt64.MaxValue` is only several billion, a totally reasonable number of instances in a dataset, whereas a collision in a 128-bit space is far less likely.

Let's consider the IDs of a collection of entities, then, to ideally form an "acceptable set." An "acceptable set" is one that is not especially or perversely likely to contain collisions, and also one unlikely to give rise to an especially or perversely collision-prone set of IDs, so long as new IDs are derived from it according to the following operations on acceptable sets.

1. The simple enumeration of `UInt128` numeric values from any number is an acceptable set. (This covers how most loaders generate IDs. Typically we start from 0, but other choices, like -1, are acceptable.)

2. The subset of any acceptable set is an acceptable set. (For example, all filter transforms that map any input row to 0 or 1 output rows can just pass through the input cursor's IDs.)

3. Applying `Fork` to every element of an acceptable set exactly once will result in an acceptable set.

4. As a generalization of the above: if, for each element of an acceptable set, you built the set comprised of the single application of `Fork` on that ID followed by the set of any number of applications of `Next`, the union of all such sets would itself be an acceptable set. (This is useful, for example, for operations that produce multiple items per input item. So, if you produced two rows based on every single input row, and the input ID were _id_, then the ID of the first row could be `Fork` of _id_, and the second row could have the ID `Fork` then `Next` of the same _id_.)

5. If you have potentially multiple acceptable sets, while their union obviously might not be acceptable, if you were to form a mapping from each set to a different ID of some other acceptable set (each such ID should be different), and then, for each such set/ID pairing, create the set formed from `Combine` of the items of that set with that ID, then the union of those sets will be acceptable.
(This is useful, for example, if you had something like a join, or a Cartesian product transform, or something like that.)

6. Moreover, similar to the note about the use of `Fork` and `Next`: if, during the creation of one of the sets described above, you were to form for each item of that set a set resulting from multiple applications of `Next`, the union of all those would also be an acceptable set.

This list is not exhaustive. Other operations I have not listed above might result in an acceptable set as well, but one should not attempt other operations without being absolutely certain of what one is doing. The general idea is that the construction of IDs should be structured so that it never arises that the same ID is hashed against the same data in two places that are expected to produce two separate IDs.

Of course, with a malicious actor upstream, collisions are possible and can be engineered quite trivially (e.g., just by returning a constant ID for all rows), but we're not supposing that the input `IDataView` is maliciously engineering hash states, or applying the operations above in some strange way to attempt to induce collisions. E.g., you could take operation 1, define it to be the enumeration of all `UInt128` values, then use operation 2 to select out specifically those hash states that will result in collisions. But I'm supposing this is not happening. If you are running an implementation of a dataview in memory that you suppose is malicious, you probably have bigger problems than someone inducing ID collisions.
\ No newline at end of file
diff --git a/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs b/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs
index 3a468ad451..586f6a4b02 100644
--- a/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs
+++ b/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs
@@ -49,8 +49,10 @@ public sealed class EntryPointInfo
         public readonly Type OutputType;
         public readonly Type[] InputKinds;
         public readonly Type[] OutputKinds;
+        public readonly ObsoleteAttribute ObsoleteAttribute;

-        internal EntryPointInfo(IExceptionContext ectx, MethodInfo method, TlcModule.EntryPointAttribute attribute)
+        internal EntryPointInfo(IExceptionContext ectx, MethodInfo method,
+            TlcModule.EntryPointAttribute attribute, ObsoleteAttribute obsoleteAttribute)
         {
             Contracts.AssertValueOrNull(ectx);
             ectx.AssertValue(method);
@@ -61,6 +63,7 @@ internal EntryPointInfo(IExceptionContext ectx, MethodInfo method, TlcModule.Ent
             Method = method;
             ShortName = attribute.ShortName;
             FriendlyName = attribute.UserName;
+            ObsoleteAttribute = obsoleteAttribute;

             // There are supposed to be 2 parameters, env and input for non-macro nodes.
             // Macro nodes have a 3rd parameter, the entry point node.
@@ -183,7 +186,10 @@ private ModuleCatalog(IExceptionContext ectx)
                 var attr = methodInfo.GetCustomAttributes(typeof(TlcModule.EntryPointAttribute), false).FirstOrDefault() as TlcModule.EntryPointAttribute;
                 if (attr == null)
                     continue;
-                var info = new EntryPointInfo(ectx, methodInfo, attr);
+
+                var info = new EntryPointInfo(ectx, methodInfo, attr,
+                    methodInfo.GetCustomAttributes(typeof(ObsoleteAttribute), false).FirstOrDefault() as ObsoleteAttribute);
+
                 entryPoints.Add(info);
                 if (_entryPointMap.ContainsKey(info.Name))
                 {
diff --git a/src/Microsoft.ML.Core/Utilities/MathUtils.cs b/src/Microsoft.ML.Core/Utilities/MathUtils.cs
index e2848ea25d..8106ff5a2c 100644
--- a/src/Microsoft.ML.Core/Utilities/MathUtils.cs
+++ b/src/Microsoft.ML.Core/Utilities/MathUtils.cs
@@ -871,5 +871,16 @@ public static double Cos(double a)
             var res = Math.Cos(a);
             return Math.Abs(res) > 1 ? double.NaN : res;
         }
+
+        /// <summary>
+        /// Returns the smallest integral value that is greater than or equal to the result of the division.
+        /// </summary>
+        /// <param name="numerator">Number to be divided.</param>
+        /// <param name="denomenator">Number with which to divide the numerator.</param>
+        /// <returns></returns>
+        public static long DivisionCeiling(long numerator, long denomenator)
+        {
+            return (checked(numerator + denomenator) - 1) / denomenator;
+        }
     }
 }
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 3867b18f26..3678c749ba 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -85,15 +85,19 @@ private bool TryParse(string str)
                 return TryParseSource(rgstr[istr++]);
             }

-            private bool TryParseSource(string str)
+            private bool TryParseSource(string str) => TryParseSourceEx(str, out Source);
+
+            public static bool TryParseSourceEx(string str, out Range[] ranges)
             {
+                ranges = null;
                 var strs = str.Split(',');
                 if (str.Length == 0)
                     return false;
-                Source = new Range[strs.Length];
+
+                ranges = new Range[strs.Length];
                 for (int i = 0; i < strs.Length; i++)
                 {
-                    if ((Source[i] = Range.Parse(strs[i])) == null)
+                    if ((ranges[i] = Range.Parse(strs[i])) == null)
                         return false;
                 }
                 return true;
@@ -294,9 +298,12 @@ public class ArgumentsCore
                 ShortName = "size")]
             public int? InputSize;

-            [Argument(ArgumentType.AtMostOnce, HelpText = "Source column separator. Options: tab, space, comma, single character", ShortName = "sep")]
+            [Argument(ArgumentType.AtMostOnce, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, HelpText = "Source column separator. Options: tab, space, comma, single character", ShortName = "sep")]
             public string Separator = "tab";

+            [Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")]
+            public char[] SeparatorChars = new[] { '\t' };
+
             [Argument(ArgumentType.Multiple, HelpText = "Column groups. Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40", ShortName = "col", SortOrder = 1)]
             public Column[] Column;
@@ -1005,26 +1012,40 @@ public TextLoader(IHostEnvironment env, Arguments args, IMultiStreamSource files
             _inputSize = SrcLim - 1;
             _host.CheckNonEmpty(args.Separator, nameof(args.Separator), "Must specify a separator");

-            string sep = args.Separator.ToLowerInvariant();
-            if (sep == ",")
-                _separators = new char[] { ',' };
-            else
+            // By default args.Separator is tab and args.SeparatorChars is also '\t'.
+            // At any time only one of the two can differ from its default, and
+            // whichever differs is the one that is used.
+            if (args.SeparatorChars.Length > 1 || args.SeparatorChars[0] != '\t')
             {
                 var separators = new HashSet<char>();
-                foreach (string s in sep.Split(','))
-                {
-                    if (string.IsNullOrEmpty(s))
-                        continue;
+                foreach (char c in args.SeparatorChars)
+                    separators.Add(NormalizeSeparator(c.ToString()));

-                    char c = NormalizeSeparator(s);
-                    separators.Add(c);
-                }
                 _separators = separators.ToArray();
-
-                // Handling ",,,," case, that .Split() returns empty strings.
-                if (_separators.Length == 0)
+            }
+            else
+            {
+                string sep = args.Separator.ToLowerInvariant();
+                if (sep == ",")
                     _separators = new char[] { ',' };
+                else
+                {
+                    var separators = new HashSet<char>();
+                    foreach (string s in sep.Split(','))
+                    {
+                        if (string.IsNullOrEmpty(s))
+                            continue;
+
+                        char c = NormalizeSeparator(s);
+                        separators.Add(c);
+                    }
+                    _separators = separators.ToArray();
+
+                    // Handling ",,,," case, that .Split() returns empty strings.
+                    if (_separators.Length == 0)
+                        _separators = new char[] { ',' };
+                }
             }

             _bindings = new Bindings(this, cols, headerFile);
diff --git a/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs b/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs
index 337aff3c14..3ec23a01bf 100644
--- a/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs
+++ b/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs
@@ -832,21 +832,21 @@ public static class SweepableDiscreteParam
     public static class PipelineSweeperSupportedMetrics
     {
         public new static string ToString() => "SupportedMetric";
-        public const string Auc = "Auc";
+        public const string Auc = "AUC";
         public const string AccuracyMicro = "AccuracyMicro";
         public const string AccuracyMacro = "AccuracyMacro";
         public const string F1 = "F1";
-        public const string AuPrc = "AuPrc";
+        public const string AuPrc = "AUPRC";
         public const string TopKAccuracy = "TopKAccuracy";
         public const string L1 = "L1";
         public const string L2 = "L2";
-        public const string Rms = "Rms";
+        public const string Rms = "RMS";
         public const string LossFn = "LossFn";
         public const string RSquared = "RSquared";
         public const string LogLoss = "LogLoss";
         public const string LogLossReduction = "LogLossReduction";
-        public const string Ndcg = "Ndcg";
-        public const string Dcg = "Dcg";
+        public const string Ndcg = "NDCG";
+        public const string Dcg = "DCG";
         public const string PositivePrecision = "PositivePrecision";
         public const string PositiveRecall = "PositiveRecall";
         public const string NegativePrecision = "NegativePrecision";
@@ -858,9 +858,9 @@ public static class PipelineSweeperSupportedMetrics
         public const string ThreshAtK = "ThreshAtK";
         public const string ThreshAtP = "ThreshAtP";
         public const string ThreshAtNumPos = "ThreshAtNumPos";
-        public const string Nmi = "Nmi";
+        public const string Nmi = "NMI";
         public const string AvgMinScore = "AvgMinScore";
-        public const string Dbi = "Dbi";
+        public const string Dbi = "DBI";
     }
 }
}
diff --git a/src/Microsoft.ML.Data/Transforms/TermTransform.md b/src/Microsoft.ML.Data/Transforms/TermTransform.md
new file mode 100644
index 0000000000..d245fda91b
--- /dev/null
+++ b/src/Microsoft.ML.Data/Transforms/TermTransform.md
@@ -0,0 +1,41 @@
# `TermTransform` Architecture

The term transform takes one or more input columns, and builds a map from observed values into a key type, with various options. This requires first that we build a map given observed data, and then later have a means of applying that map to new data. There are four helper classes of objects to perform this task. We describe them here.
* `Builder` instances can have different behavior depending on the item type of the input, and whether we are sorting the input. Crucially, they work over only primitive types, and are not aware of whether the input data is vector or scalar. As their name implies, they are stateful objects, with mutable state.

* `Trainer` objects wrap a builder, and have different implementations depending on whether their input is vector or scalar. They are also responsible for making sure the number of values accumulated does not exceed the max terms limit. During the term transform's training, these objects are constructed given a row and a particular column, and a method is called on them to process that row.

The above two classes of objects will be created and in existence only when the transform is being trained, that is, in the non-deserializing constructor, and will not be persisted beyond that point.

* `TermMap` objects are created from builder objects, and are the final term map. These are sort of the frozen immutable cousins of builders. Like builders they work over primitive types. These objects are the ones responsible for serialization and deserialization to the model stream and other informational streams, construction of the per-item value mapper delegates, and accessors for the term values used in constructing the metadata (though they do not handle the actual metadata functions themselves). Crucially, these objects can be shared among multiple term transforms or multiple columns, and are not associated themselves with a particular input dataview or column per se.

* `BoundTermMap` objects are bound to a particular dataview, and a particular column. They are responsible for the polymorphism depending on whether the column they're mapping is vector or scalar, the creation of the metadata accessors, and the creation of the actual getters (though, of course, they rely on the term map to do this).
diff --git a/src/Microsoft.ML.Parquet/ParquetLoader.cs b/src/Microsoft.ML.Parquet/ParquetLoader.cs
index f0ecde34dc..21271f6e5c 100644
--- a/src/Microsoft.ML.Parquet/ParquetLoader.cs
+++ b/src/Microsoft.ML.Parquet/ParquetLoader.cs
@@ -94,7 +94,7 @@ public sealed class Arguments
         private readonly int _columnChunkReadSize;
         private readonly Column[] _columnsLoaded;
         private readonly DataSet _schemaDataSet;
-        private const int _defaultColumnChunkReadSize = 100; // Should ideally be close to Rowgroup size
+        private const int _defaultColumnChunkReadSize = 1000000;

         private bool _disposed;
@@ -368,8 +368,8 @@ private sealed class Cursor : RootCursorBase, IRowCursor
             private readonly Delegate[] _getters;
             private readonly ReaderOptions _readerOptions;
             private int _curDataSetRow;
-            private IEnumerator _dataSetEnumerator;
-            private IEnumerator _blockEnumerator;
+            private IEnumerator<int> _dataSetEnumerator;
+            private IEnumerator<int> _blockEnumerator;
             private IList[] _columnValues;
             private IRandom _rand;
@@ -390,11 +390,18 @@ public Cursor(ParquetLoader parent, Func<int, bool> predicate, IRandom rand)
                     Columns = _loader._columnsLoaded.Select(i => i.Name).ToArray()
                 };

-                int numBlocks = (int)Math.Ceiling(((decimal)parent.GetRowCount() / _readerOptions.Count));
-                int[] blockOrder = _rand == null ? Utils.GetIdentityPermutation(numBlocks) : Utils.GetRandomPermutation(rand, numBlocks);
+                // The number of blocks is calculated based on the specified rows in a block (defaults to 1M).
+                // Since we want to shuffle the blocks in addition to shuffling the rows in each block, checks
+                // are put in place to ensure we can produce a shuffle order for the blocks.
+                var numBlocks = MathUtils.DivisionCeiling((long)parent.GetRowCount(), _readerOptions.Count);
+                if (numBlocks > int.MaxValue)
+                {
+                    throw _loader._host.ExceptParam(nameof(Arguments.ColumnChunkReadSize), "Error due to too many blocks. Try increasing block size.");
+                }
+                var blockOrder = CreateOrderSequence((int)numBlocks);
                 _blockEnumerator = blockOrder.GetEnumerator();

-                _dataSetEnumerator = new int[0].GetEnumerator(); // Initialize an empty enumerator to get started
+                _dataSetEnumerator = Enumerable.Empty<int>().GetEnumerator();
                 _columnValues = new IList[_actives.Length];
                 _getters = new Delegate[_actives.Length];
                 for (int i = 0; i < _actives.Length; ++i)
@@ -472,12 +479,12 @@ protected override bool MoveNextCore()
             {
                 if (_dataSetEnumerator.MoveNext())
                 {
-                    _curDataSetRow = (int)_dataSetEnumerator.Current;
+                    _curDataSetRow = _dataSetEnumerator.Current;
                     return true;
                 }
                 else if (_blockEnumerator.MoveNext())
                 {
-                    _readerOptions.Offset = (int)_blockEnumerator.Current * _readerOptions.Count;
+                    _readerOptions.Offset = (long)_blockEnumerator.Current * _readerOptions.Count;

                     // When current dataset runs out, read the next portion of the parquet file.
                     DataSet ds;
                     ds = ParquetReader.Read(_loader._parquetStream, _loader._parquetOptions, _readerOptions);
                     }

-                    int[] dataSetOrder = _rand == null ? Utils.GetIdentityPermutation(ds.RowCount) : Utils.GetRandomPermutation(_rand, ds.RowCount);
+                    var dataSetOrder = CreateOrderSequence(ds.RowCount);
                     _dataSetEnumerator = dataSetOrder.GetEnumerator();
-                    _curDataSetRow = dataSetOrder[0];
+                    _curDataSetRow = dataSetOrder.ElementAt(0);

                     // Cache list for each active column
                     for (int i = 0; i < _actives.Length; i++)
@@ -533,6 +540,26 @@ public bool IsColumnActive(int col)
                 Ch.CheckParam(0 <= col && col < _colToActivesIndex.Length, nameof(col));
                 return _colToActivesIndex[col] >= 0;
             }
+
+            /// <summary>
+            /// Creates an in-order or shuffled sequence, based on whether _rand is specified.
+            /// If unable to create a shuffle sequence, will default to sequential.
+            /// </summary>
+            /// <param name="size">Number of elements in the sequence.</param>
+            /// <returns></returns>
+            private IEnumerable<int> CreateOrderSequence(int size)
+            {
+                IEnumerable<int> order;
+                try
+                {
+                    order = _rand == null ?
+                        Enumerable.Range(0, size) :
+                        Utils.GetRandomPermutation(_rand, size);
+                }
+                catch (OutOfMemoryException)
+                {
+                    order = Enumerable.Range(0, size);
+                }
+                return order;
+            }
         }

         #region Dispose
@@ -671,4 +698,4 @@ private string ConvertListToString(IList list)
         }
     }
 }
-}
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs
index 894029460a..642ff4d0d7 100644
--- a/src/Microsoft.ML.PipelineInference/AutoInference.cs
+++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs
@@ -158,7 +158,8 @@ private bool GetDataVariableName(IExceptionContext ectx, string nameOfData, JTok
                 return false;

             string dataVar = firstNodeInputs.Value(nameOfData);
-            ectx.Check(VariableBinding.IsValidVariableName(ectx, dataVar), $"Invalid variable name {dataVar}.");
+            if (!VariableBinding.IsValidVariableName(ectx, dataVar))
+                throw ectx.ExceptParam(nameof(nameOfData), $"Invalid variable name {dataVar}.");

             variableName = dataVar.Substring(1);
             return true;
@@ -172,12 +173,14 @@ private bool GetDataVariableName(IExceptionContext ectx, string nameOfData, JTok
         public sealed class RunSummary
         {
             public double MetricValue { get; }
+            public double TrainingMetricValue { get; }
             public int NumRowsInTraining { get; }
             public long RunTimeMilliseconds { get; }

-            public RunSummary(double metricValue, int numRows, long runTimeMilliseconds)
+            public RunSummary(double metricValue, int numRows, long runTimeMilliseconds, double trainingMetricValue)
             {
                 MetricValue = metricValue;
+                TrainingMetricValue = trainingMetricValue;
                 NumRowsInTraining = numRows;
                 RunTimeMilliseconds = runTimeMilliseconds;
             }
@@ -303,7 +306,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows)
             var stopwatch = new Stopwatch();
             var probabilityUtils = new Sweeper.Algorithms.SweeperProbabilityUtils(_host);

-            while (!_terminator.ShouldTerminate(_history))
+            while (!_terminator.ShouldTerminate(_history))
             {
                 // Get next set of candidates
                 var currentBatchSize = batchSize;
@@ -341,16 +344,17 @@ private void ProcessPipeline(Sweeper.Algorithms.SweeperProbabilityUtils utils, S
             // Run pipeline, and time how long it takes
             stopwatch.Restart();
-            double d = candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
-                _testData, Metric, TrainerKind);
+            candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
+                _testData, Metric, TrainerKind, out var testMetricVal, out var trainMetricVal);
             stopwatch.Stop();

             // Handle key collisions on sorted list
-            while (_sortedSampledElements.ContainsKey(d))
-                d += 1e-10;
+            while (_sortedSampledElements.ContainsKey(testMetricVal))
+                testMetricVal += 1e-10;

             // Save performance score
-            candidate.PerformanceSummary = new RunSummary(d, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds);
+            candidate.PerformanceSummary =
+                new RunSummary(testMetricVal, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds, trainMetricVal);
             _sortedSampledElements.Add(candidate.PerformanceSummary.MetricValue, candidate);
             _history.Add(candidate);
         }
@@ -579,11 +583,13 @@ public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimiz
             RecipeInference.InferRecipesFromData(env, trainDataPath, schemaDefinitionFile,
                 out var _, out schemaDefinition, out var _, true);
+#pragma warning disable 0618
             var data = ImportTextData.ImportText(env, new ImportTextData.Input
             {
                 InputFile = new SimpleFileHandle(env, trainDataPath, false, false),
                 CustomSchema = schemaDefinition
             }).Data;
+#pragma warning restore 0618
             var splitOutput =
                 TrainTestSplit.Split(env, new TrainTestSplit.Input { Data = data, Fraction = 0.8f });
             AutoMlMlState amls = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind,
                 splitOutput.TrainData.Take(numOfSampleRows), splitOutput.TestData.Take(numOfSampleRows));
diff --git a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs
index bd4de97b48..a0aae16a63 100644
--- a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs
+++ b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs
@@ -15,21 +15,34 @@ namespace Microsoft.ML.Runtime.PipelineInference
 {
     public static class AutoMlUtils
     {
-        public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView data, string metricColumnName)
+        public static double ExtractValueFromIDV(IHostEnvironment env, IDataView result, string columnName)
         {
-            double metricValue = 0;
-            int numRows = 0;
-            var schema = data.Schema;
-            schema.TryGetColumnIndex(metricColumnName, out var metricCol);
+            Contracts.CheckValue(env, nameof(env));
+            env.CheckValue(result, nameof(result));
+            env.CheckNonEmpty(columnName, nameof(columnName));

-            using (var cursor = data.GetRowCursor(col => col == metricCol))
+            double outputValue = 0;
+            var schema = result.Schema;
+            if (!schema.TryGetColumnIndex(columnName, out var metricCol))
+                throw env.ExceptParam(nameof(columnName), $"Schema does not contain column: {columnName}");
+
+            using (var cursor = result.GetRowCursor(col => col == metricCol))
             {
                 var getter = cursor.GetGetter<double>(metricCol);
-                cursor.MoveNext();
-                getter(ref metricValue);
+                bool moved = cursor.MoveNext();
+                env.Check(moved, "Expected an IDataView with a single row. Results dataset has no rows to extract.");
+                getter(ref outputValue);
+                env.Check(!cursor.MoveNext(), "Expected an IDataView with a single row. Results dataset has too many rows.");
             }
-            return new AutoInference.RunSummary(metricValue, numRows, 0);
+            return outputValue;
+        }
+
+        public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView result, string metricColumnName, IDataView trainResult = null)
+        {
+            double testingMetricValue = ExtractValueFromIDV(env, result, metricColumnName);
+            double trainingMetricValue = trainResult != null ? ExtractValueFromIDV(env, trainResult, metricColumnName) : double.MinValue;
+            return new AutoInference.RunSummary(testingMetricValue, 0, 0, trainingMetricValue);
         }

         public static CommonInputs.IEvaluatorInput CloneEvaluatorInstance(CommonInputs.IEvaluatorInput evalInput) =>
@@ -618,5 +631,7 @@ public static Tuple[] ConvertToSweepArgumentStrings(TlcModule.
             }
             return results;
         }
+
+        public static string GenerateOverallTrainingMetricVarName(Guid id) => $"Var_Training_OM_{id:N}";
     }
 }
diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
index 58f44b9ce8..06c260a054 100644
--- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
+++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
@@ -65,11 +65,14 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input)
             var col1 = new KeyValuePair<string, ColumnType>("Graph", TextType.Instance);
             var col2 = new KeyValuePair<string, ColumnType>("MetricValue", PrimitiveType.FromKind(DataKind.R8));
             var col3 = new KeyValuePair<string, ColumnType>("PipelineId", TextType.Instance);
+            var col4 = new KeyValuePair<string, ColumnType>("TrainingMetricValue", PrimitiveType.FromKind(DataKind.R8));
+            var col5 = new KeyValuePair<string, ColumnType>("FirstInput", TextType.Instance);
+            var col6 = new KeyValuePair<string, ColumnType>("PredictorModel", TextType.Instance);

             if (rows.Count == 0)
             {
                 var host = env.Register("ExtractSweepResult");
-                outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3));
+                outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3, col4, col5, col6));
             }
             else
             {
@@ -77,6 +80,9 @@
                 builder.AddColumn(col1.Key, (PrimitiveType)col1.Value, rows.Select(r => new DvText(r.GraphJson)).ToArray());
                 builder.AddColumn(col2.Key, (PrimitiveType)col2.Value, rows.Select(r => r.MetricValue).ToArray());
                 builder.AddColumn(col3.Key, (PrimitiveType)col3.Value, rows.Select(r => new DvText(r.PipelineId)).ToArray());
+                builder.AddColumn(col4.Key, (PrimitiveType)col4.Value, rows.Select(r => r.TrainingMetricValue).ToArray());
+                builder.AddColumn(col5.Key, (PrimitiveType)col5.Value, rows.Select(r => new DvText(r.FirstInput)).ToArray());
+                builder.AddColumn(col6.Key, (PrimitiveType)col6.Value, rows.Select(r => new DvText(r.PredictorModel)).ToArray());
                 outputView = builder.GetDataView();
             }
             return new Output { Results = outputView, State = autoMlState };
@@ -132,11 +138,11 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
             // Extract performance summaries and assign to previous candidate pipelines.
             foreach (var pipeline in autoMlState.BatchCandidates)
             {
-                if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId),
-                    out var v))
+                if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId), out var v) &&
+                    node.Context.TryGetVariable(AutoMlUtils.GenerateOverallTrainingMetricVarName(pipeline.UniqueId), out var v2))
                 {
                     pipeline.PerformanceSummary =
-                        AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name);
+                        AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name, (IDataView)v2.Value);
                     autoMlState.AddEvaluated(pipeline);
                 }
             }
@@ -168,14 +174,17 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
             {
                 // Add train test experiments to current graph for candidate pipeline
                 var subgraph = new Experiment(env);
-                var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph);
+                var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph, true);

                 // Change variable name to reference pipeline ID in output map, context and entrypoint output.
                 var uniqueName = ExperimentUtils.GenerateOverallMetricVarName(p.UniqueId);
+                var uniqueNameTraining = AutoMlUtils.GenerateOverallTrainingMetricVarName(p.UniqueId);
                 var sgNode = EntryPointNode.ValidateNodes(env, node.Context,
                     new JArray(subgraph.GetNodes().Last()), node.Catalog).Last();
                 sgNode.RenameOutputVariable(trainTestOutput.OverallMetrics.VarName, uniqueName, cascadeChanges: true);
+                sgNode.RenameOutputVariable(trainTestOutput.TrainingOverallMetrics.VarName, uniqueNameTraining, cascadeChanges: true);
                 trainTestOutput.OverallMetrics.VarName = uniqueName;
+                trainTestOutput.TrainingOverallMetrics.VarName = uniqueNameTraining;
                 expNodes.Add(sgNode);

                 // Store indicators, to pass to next iteration of macro.
diff --git a/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj b/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
index 7cf9585f3b..ab3e464c74 100644
--- a/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
+++ b/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
@@ -17,6 +17,7 @@
+
diff --git a/src/Microsoft.ML.PipelineInference/PipelinePattern.cs b/src/Microsoft.ML.PipelineInference/PipelinePattern.cs
index 21287742a0..662a16798f 100644
--- a/src/Microsoft.ML.PipelineInference/PipelinePattern.cs
+++ b/src/Microsoft.ML.PipelineInference/PipelinePattern.cs
@@ -17,20 +17,38 @@ namespace Microsoft.ML.Runtime.PipelineInference
     /// </summary>
     public sealed class PipelinePattern : IEquatable<PipelinePattern>
     {
+        /// <summary>
+        /// Class for encapsulating the information returned in the output IDataView for a pipeline
+        /// that has been run through the TrainTest macro.
+        /// </summary>
         public sealed class PipelineResultRow
         {
             public string GraphJson { get; }
+            /// <summary>
+            /// The metric value of the test dataset result (always needed).
+            /// </summary>
             public double MetricValue { get; }
+            /// <summary>
+            /// The metric value of the training dataset result (not always used or set).
+            /// </summary>
+            public double TrainingMetricValue { get; }
             public string PipelineId { get; }
+            public string FirstInput { get; }
+            public string PredictorModel { get; }

             public PipelineResultRow() { }

-            public PipelineResultRow(string graphJson, double metricValue, string pipelineId)
+            public PipelineResultRow(string graphJson, double metricValue,
+                string pipelineId, double trainingMetricValue, string firstInput,
+                string predictorModel)
             {
                 GraphJson = graphJson;
                 MetricValue = metricValue;
                 PipelineId = pipelineId;
+                TrainingMetricValue = trainingMetricValue;
+                FirstInput = firstInput;
+                PredictorModel = predictorModel;
             }
         }
@@ -111,7 +129,8 @@ public AutoInference.EntryPointGraphDef ToEntryPointGraph(Experiment experiment
         public bool Equals(PipelinePattern obj) => obj != null && UniqueId == obj.UniqueId;

         // REVIEW: We may want to allow for sweeping with CV in the future, so we will need to add new methods like this, or refactor these in that case.
- public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind, out Models.TrainTestEvaluator.Output resultsOutput) + public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind, + bool includeTrainingMetrics, out Models.TrainTestEvaluator.Output resultsOutput) { var graphDef = ToEntryPointGraph(); var subGraph = graphDef.Graph; @@ -136,7 +155,8 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD Model = finalOutput }, PipelineId = UniqueId.ToString("N"), - Kind = MacroUtils.TrainerKindApiValue(trainerKind) + Kind = MacroUtils.TrainerKindApiValue(trainerKind), + IncludeTrainingMetrics = includeTrainingMetrics }; var experiment = _env.CreateExperiment(); @@ -150,7 +170,7 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD } public Models.TrainTestEvaluator.Output AddAsTrainTest(Var trainData, Var testData, - MacroUtils.TrainerKinds trainerKind, Experiment experiment = null) + MacroUtils.TrainerKinds trainerKind, Experiment experiment = null, bool includeTrainingMetrics = false) { experiment = experiment ?? _env.CreateExperiment(); var graphDef = ToEntryPointGraph(experiment); @@ -174,7 +194,8 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var trainData, TrainingData = trainData, TestingData = testData, Kind = MacroUtils.TrainerKindApiValue(trainerKind), - PipelineId = UniqueId.ToString("N") + PipelineId = UniqueId.ToString("N"), + IncludeTrainingMetrics = includeTrainingMetrics }; var trainTestOutput = experiment.Add(trainTestInput); return trainTestOutput; @@ -183,57 +204,80 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var trainData, /// /// Runs a train-test experiment on the current pipeline, through entrypoints. 
/// - public double RunTrainTestExperiment(IDataView trainData, IDataView testData, AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind) + public void RunTrainTestExperiment(IDataView trainData, IDataView testData, + AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind, out double testMetricValue, + out double trainMetricValue) { - var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, out var trainTestOutput); + var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, true, out var trainTestOutput); experiment.Run(); - var dataOut = experiment.GetOutput(trainTestOutput.OverallMetrics); - var schema = dataOut.Schema; - schema.TryGetColumnIndex(metric.Name, out var metricCol); - using (var cursor = dataOut.GetRowCursor(col => col == metricCol)) - { - var getter = cursor.GetGetter(metricCol); - double metricValue = 0; - cursor.MoveNext(); - getter(ref metricValue); - return metricValue; - } + var dataOut = experiment.GetOutput(trainTestOutput.OverallMetrics); + var dataOutTraining = experiment.GetOutput(trainTestOutput.TrainingOverallMetrics); + testMetricValue = AutoMlUtils.ExtractValueFromIDV(_env, dataOut, metric.Name); + trainMetricValue = AutoMlUtils.ExtractValueFromIDV(_env, dataOutTraining, metric.Name); } - public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data, string graphColName, string metricColName, string idColName) + public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data, + string graphColName, string metricColName, string idColName, string trainingMetricColName, + string firstInputColName, string predictorModelColName) { var results = new List(); var schema = data.Schema; if (!schema.TryGetColumnIndex(graphColName, out var graphCol)) - throw env.ExceptNotSupp($"Column name {graphColName} not found"); + throw env.ExceptParam(nameof(graphColName), $"Column name {graphColName} not found"); if (!schema.TryGetColumnIndex(metricColName, out var metricCol)) - throw env.ExceptNotSupp($"Column name {metricColName} not found"); + throw env.ExceptParam(nameof(metricColName), $"Column name {metricColName} not found"); + if (!schema.TryGetColumnIndex(trainingMetricColName, out var trainingMetricCol)) + throw env.ExceptParam(nameof(trainingMetricColName), $"Column name {trainingMetricColName} not found"); if (!schema.TryGetColumnIndex(idColName, out var pipelineIdCol)) - throw env.ExceptNotSupp($"Column name {idColName} not found"); + throw env.ExceptParam(nameof(idColName), $"Column name {idColName} not found"); + if (!schema.TryGetColumnIndex(firstInputColName, out var firstInputCol)) + throw env.ExceptParam(nameof(firstInputColName), $"Column name {firstInputColName} not found"); + if (!schema.TryGetColumnIndex(predictorModelColName, out var predictorModelCol)) + throw env.ExceptParam(nameof(predictorModelColName), $"Column name {predictorModelColName} not found"); using (var cursor = data.GetRowCursor(col => true)) { + var getter1 = cursor.GetGetter(metricCol); + var getter2 = cursor.GetGetter(graphCol); + var getter3 = cursor.GetGetter(pipelineIdCol); + var getter4 = cursor.GetGetter(trainingMetricCol); + var getter5 = cursor.GetGetter(firstInputCol); + var getter6 = cursor.GetGetter(predictorModelCol); + double metricValue = 0; + double trainingMetricValue = 0; + DvText graphJson = new DvText(); + DvText pipelineId = new DvText(); + DvText firstInput = new DvText(); + DvText predictorModel = new DvText(); + while (cursor.MoveNext()) { - 
var getter1 = cursor.GetGetter(metricCol); - double metricValue = 0; getter1(ref metricValue); - var getter2 = cursor.GetGetter(graphCol); - DvText graphJson = new DvText(); getter2(ref graphJson); - var getter3 = cursor.GetGetter(pipelineIdCol); - DvText pipelineId = new DvText(); getter3(ref pipelineId); - results.Add(new PipelineResultRow(graphJson.ToString(), metricValue, pipelineId.ToString())); + getter4(ref trainingMetricValue); + getter5(ref firstInput); + getter6(ref predictorModel); + + results.Add(new PipelineResultRow(graphJson.ToString(), + metricValue, pipelineId.ToString(), trainingMetricValue, + firstInput.ToString(), predictorModel.ToString())); } } return results.ToArray(); } - public PipelineResultRow ToResultRow() => - new PipelineResultRow(ToEntryPointGraph().Graph.ToJsonString(), - PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N")); + public PipelineResultRow ToResultRow() + { + var graphDef = ToEntryPointGraph(); + + return new PipelineResultRow($"{{'Nodes' : [{graphDef.Graph.ToJsonString()}]}}", + PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N"), + PerformanceSummary?.TrainingMetricValue ?? -1d, + graphDef.GetSubgraphFirstNodeDataVarName(_env), + graphDef.ModelOutput.VarName); + } } } diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index 2ca1af7159..317ee98db0 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -22,6 +22,30 @@ namespace Runtime { public sealed partial class Experiment { + public Microsoft.ML.Data.CustomTextLoader.Output Add(Microsoft.ML.Data.CustomTextLoader input) + { + var output = new Microsoft.ML.Data.CustomTextLoader.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Data.CustomTextLoader input, Microsoft.ML.Data.CustomTextLoader.Output output) + { + _jsonNodes.Add(Serialize("Data.CustomTextLoader", input, output)); + } + + public Microsoft.ML.Data.DataViewReference.Output Add(Microsoft.ML.Data.DataViewReference input) + { + var output = new Microsoft.ML.Data.DataViewReference.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Data.DataViewReference input, Microsoft.ML.Data.DataViewReference.Output output) + { + _jsonNodes.Add(Serialize("Data.DataViewReference", input, output)); + } + public Microsoft.ML.Data.IDataViewArrayConverter.Output Add(Microsoft.ML.Data.IDataViewArrayConverter input) { var output = new Microsoft.ML.Data.IDataViewArrayConverter.Output(); @@ -53,22 +77,11 @@ public Microsoft.ML.Data.TextLoader.Output Add(Microsoft.ML.Data.TextLoader inpu return output; } - public Microsoft.ML.Data.DataViewReference.Output Add(Microsoft.ML.Data.DataViewReference input) - { - var output = new Microsoft.ML.Data.DataViewReference.Output(); - Add(input, output); - return output; - } - public void Add(Microsoft.ML.Data.TextLoader input, Microsoft.ML.Data.TextLoader.Output output) { _jsonNodes.Add(Serialize("Data.TextLoader", input, output)); } - public void Add(Microsoft.ML.Data.DataViewReference input, Microsoft.ML.Data.DataViewReference.Output output) - { - _jsonNodes.Add(Serialize("Data.DataViewReference", input, output)); - } public Microsoft.ML.Models.AnomalyDetectionEvaluator.Output Add(Microsoft.ML.Models.AnomalyDetectionEvaluator input) { var output = new Microsoft.ML.Models.AnomalyDetectionEvaluator.Output(); @@ -453,6 +466,18 @@ public void Add(Microsoft.ML.Trainers.GeneralizedAdditiveModelRegressor input, M _jsonNodes.Add(Serialize("Trainers.GeneralizedAdditiveModelRegressor", 
input, output)); } + public Microsoft.ML.Trainers.KMeansPlusPlusClusterer.Output Add(Microsoft.ML.Trainers.KMeansPlusPlusClusterer input) + { + var output = new Microsoft.ML.Trainers.KMeansPlusPlusClusterer.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainers.KMeansPlusPlusClusterer input, Microsoft.ML.Trainers.KMeansPlusPlusClusterer.Output output) + { + _jsonNodes.Add(Serialize("Trainers.KMeansPlusPlusClusterer", input, output)); + } + public Microsoft.ML.Trainers.LinearSvmBinaryClassifier.Output Add(Microsoft.ML.Trainers.LinearSvmBinaryClassifier input) { var output = new Microsoft.ML.Trainers.LinearSvmBinaryClassifier.Output(); @@ -1271,6 +1296,66 @@ public void Add(Microsoft.ML.Transforms.WordTokenizer input, Microsoft.ML.Transf } } + namespace Data + { + + /// + /// Import a dataset from a text file + /// + [Obsolete("Use TextLoader instead.")] + public sealed partial class CustomTextLoader + { + + + /// + /// Location of the input file + /// + public Var InputFile { get; set; } = new Var(); + + /// + /// Custom schema to use for parsing + /// + public string CustomSchema { get; set; } + + + public sealed class Output + { + /// + /// The resulting data view + /// + public Var Data { get; set; } = new Var(); + + } + } + } + + namespace Data + { + + /// + /// Pass dataview from memory to experiment + /// + public sealed partial class DataViewReference + { + + + /// + /// Pointer to IDataView in memory + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output + { + /// + /// The resulting data view + /// + public Var Data { get; set; } = new Var(); + + } + } + } + namespace Data { @@ -1328,40 +1413,185 @@ public sealed class Output namespace Data { - /// - /// Import a dataset from a text file - /// - public sealed partial class TextLoader + public sealed partial class TextLoaderArguments { + /// + /// Use separate parsing threads? + /// + public bool UseThreads { get; set; } = true; + /// + /// File containing a header with feature names. If specified, header defined in the data file (header+) is ignored. + /// + public string HeaderFile { get; set; } /// - /// Location of the input file + /// Maximum number of rows to produce /// - public Var InputFile { get; set; } = new Var(); + public long? MaxRows { get; set; } /// - /// Custom schema to use for parsing + /// Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by "". When false, consecutive separators denote an empty value. /// - public string CustomSchema { get; set; } + public bool AllowQuoting { get; set; } = true; + + /// + /// Whether the input may include sparse representations + /// + public bool AllowSparse { get; set; } = true; + /// + /// Number of source columns in the text data. Default is that sparse rows contain their size information. + /// + public int? InputSize { get; set; } - public sealed class Output - { - /// - /// The resulting data view - /// - public Var Data { get; set; } = new Var(); + /// + /// Source column separator. + /// + public char[] Separator { get; set; } = { '\t' }; + + /// + /// Column groups. 
Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40 + /// + public TextLoaderColumn[] Column { get; set; } + + /// + /// Remove trailing whitespace from lines + /// + public bool TrimWhitespace { get; set; } = false; + + /// + /// Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified. + /// + public bool HasHeader { get; set; } = false; - } } - public sealed partial class DataViewReference + public sealed partial class TextLoaderColumn { + /// + /// Name of the column + /// + public string Name { get; set; } + + /// + /// Type of the items in the column + /// + public DataKind? Type { get; set; } + + /// + /// Source index range(s) of the column + /// + public TextLoaderRange[] Source { get; set; } + + /// + /// For a key column, this defines the range of values + /// + public KeyRange KeyRange { get; set; } + + } + + public sealed partial class TextLoaderRange + { + /// + /// First index in the range + /// + public int Min { get; set; } + + /// + /// Last index in the range + /// + public int? Max { get; set; } + + /// + /// This range extends to the end of the line, but should be a fixed number of items + /// + public bool AutoEnd { get; set; } = false; + + /// + /// This range extends to the end of the line, which can vary from line to line + /// + public bool VariableEnd { get; set; } = false; + + /// + /// This range includes only other indices not specified + /// + public bool AllOther { get; set; } = false; + + /// + /// Force scalar columns to be treated as vectors of length one + /// + public bool ForceVector { get; set; } = false; + + } + + public sealed partial class KeyRange + { + /// + /// First index in the range + /// + public ulong Min { get; set; } = 0; + + /// + /// Last index in the range + /// + public ulong? 
Max { get; set; } + + /// + /// Whether the key is contiguous + /// + public bool Contiguous { get; set; } = true; + + } + + /// + /// Import a dataset from a text file + /// + public sealed partial class TextLoader : Microsoft.ML.ILearningPipelineLoader + { + + [JsonIgnore] + private string _inputFilePath = null; + public TextLoader(string filePath) + { + _inputFilePath = filePath; + } + + public void SetInput(IHostEnvironment env, Experiment experiment) + { + IFileHandle inputFile = new SimpleFileHandle(env, _inputFilePath, false, false); + experiment.SetInput(InputFile, inputFile); + } + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + Contracts.Assert(previousStep == null); + + return new TextLoaderPipelineStep(experiment.Add(this)); + } + + private class TextLoaderPipelineStep : ILearningPipelineDataStep + { + public TextLoaderPipelineStep (Output output) + { + Data = output.Data; + Model = null; + } + + public Var Data { get; } + public Var Model { get; } + } + /// /// Location of the input file /// - public Var Data { get; set; } = new Var(); + public Var InputFile { get; set; } = new Var(); + + /// + /// Arguments + /// + public Data.TextLoaderArguments Arguments { get; set; } = new Data.TextLoaderArguments(); + public sealed class Output { @@ -1561,7 +1791,7 @@ public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ICla namespace Models { - public sealed class CrossValidationBinaryMacroSubGraphInput + public sealed partial class CrossValidationBinaryMacroSubGraphInput { /// /// The data to be used for training @@ -1570,7 +1800,7 @@ public sealed class CrossValidationBinaryMacroSubGraphInput } - public sealed class CrossValidationBinaryMacroSubGraphOutput + public sealed partial class CrossValidationBinaryMacroSubGraphOutput { /// /// The model @@ -1826,7 +2056,7 @@ public enum MacroUtilsTrainerKinds } - public sealed class CrossValidationMacroSubGraphInput + public sealed partial class CrossValidationMacroSubGraphInput { /// /// The data to be used for training @@ -1835,7 +2065,7 @@ public sealed class CrossValidationMacroSubGraphInput } - public sealed class CrossValidationMacroSubGraphOutput + public sealed partial class CrossValidationMacroSubGraphOutput { /// /// The model @@ -2239,7 +2469,7 @@ public enum CachingOptions } - public sealed class OneVersusAllMacroSubGraphOutput + public sealed partial class OneVersusAllMacroSubGraphOutput { /// /// The predictor model for the subgraph exemplar. 
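A minimal sketch of how the regenerated TextLoader surface above is meant to be driven purely through Arguments, mirroring the test updates later in this diff; the file name and column layout are illustrative assumptions, not part of the change:

    var loader = new Microsoft.ML.Data.TextLoader("breast-cancer.txt");   // illustrative path
    loader.Arguments.HasHeader = false;
    loader.Arguments.Column = new[]
    {
        // Scalar column read from a single source index (TextLoaderRange(int)).
        new Microsoft.ML.Data.TextLoaderColumn
        {
            Name = "Label",
            Source = new[] { new Microsoft.ML.Data.TextLoaderRange(0) }
        },
        // Vector column spanning a contiguous range of source indices (TextLoaderRange(int, int)).
        new Microsoft.ML.Data.TextLoaderColumn
        {
            Name = "Features",
            Source = new[] { new Microsoft.ML.Data.TextLoaderRange(1, 9) }
        }
    };
    var pipeline = new Microsoft.ML.LearningPipeline();
    pipeline.Add(loader);   // SetInput/ApplyStep above wire the file handle into the experiment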
@@ -2877,7 +3107,7 @@ public sealed class Output namespace Models { - public sealed class TrainTestBinaryMacroSubGraphInput + public sealed partial class TrainTestBinaryMacroSubGraphInput { /// /// The data to be used for training @@ -2886,7 +3116,7 @@ public sealed class TrainTestBinaryMacroSubGraphInput } - public sealed class TrainTestBinaryMacroSubGraphOutput + public sealed partial class TrainTestBinaryMacroSubGraphOutput { /// /// The model @@ -2962,7 +3192,7 @@ public sealed class Output namespace Models { - public sealed class TrainTestMacroSubGraphInput + public sealed partial class TrainTestMacroSubGraphInput { /// /// The data to be used for training @@ -2971,7 +3201,7 @@ public sealed class TrainTestMacroSubGraphInput } - public sealed class TrainTestMacroSubGraphOutput + public sealed partial class TrainTestMacroSubGraphOutput { /// /// The model @@ -5686,6 +5916,107 @@ public GeneralizedAdditiveModelRegressorPipelineStep(Output output) } } + namespace Trainers + { + public enum KMeansPlusPlusTrainerInitAlgorithm + { + KMeansPlusPlus = 0, + Random = 1, + KMeansParallel = 2 + } + + + /// + /// K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers. + /// + public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// The number of clusters + /// + [TlcModule.SweepableDiscreteParamAttribute("K", new object[]{5, 10, 20, 40})] + public int K { get; set; } = 5; + + /// + /// Cluster initialization algorithm + /// + public Trainers.KMeansPlusPlusTrainerInitAlgorithm InitAlgorithm { get; set; } = Trainers.KMeansPlusPlusTrainerInitAlgorithm.KMeansParallel; + + /// + /// Tolerance parameter for trainer convergence. Lower = slower, more accurate + /// + public float OptTol { get; set; } = 1E-07f; + + /// + /// Maximum number of iterations. + /// + public int MaxIterations { get; set; } = 1000; + + /// + /// Memory budget (in MBs) to use for KMeans acceleration + /// + public int AccelMemBudgetMb { get; set; } = 4096; + + /// + /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. + /// + public int? 
NumThreads { get; set; } + + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IClusteringOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(KMeansPlusPlusClusterer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + Output output = experiment.Add(this); + return new KMeansPlusPlusClustererPipelineStep(output); + } + + private class KMeansPlusPlusClustererPipelineStep : ILearningPipelinePredictorStep + { + public KMeansPlusPlusClustererPipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + namespace Trainers { @@ -7196,7 +7527,7 @@ public BinaryPredictionScoreColumnsRenamerPipelineStep(Output output) namespace Transforms { - public sealed class NormalizeTransformBinColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NormalizeTransformBinColumn : OneToOneColumn, IOneToOneColumn { /// /// Max number of bins, power of 2 recommended @@ -7348,7 +7679,7 @@ public enum CategoricalTransformOutputKind : byte } - public sealed class CategoricalHashTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class CategoricalHashTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// The number of bits to hash into. Must be between 1 and 30, inclusive. 
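A hedged sketch of how the new clustering trainer composes with the LearningPipeline pattern used by the benchmark changes later in this diff; the input class, column ordinals, and file name are illustrative assumptions:

    using Microsoft.ML;
    using Microsoft.ML.Data;
    using Microsoft.ML.Runtime.Api;
    using Microsoft.ML.Trainers;
    using Microsoft.ML.Transforms;

    public class PointData
    {
        [Column("0")] public float X;   // ordinals refer to columns in the text file
        [Column("1")] public float Y;
    }

    // ...
    var pipeline = new LearningPipeline();
    pipeline.Add(new TextLoader("points.txt").CreateFrom<PointData>(useHeader: false));
    pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "X", "Y"));
    pipeline.Add(new KMeansPlusPlusClusterer { K = 10, MaxIterations = 500 });   // defaults: KMeansParallel init, OptTol = 1e-7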
@@ -7518,7 +7849,7 @@ public enum TermTransformSortOrder : byte } - public sealed class CategoricalTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class CategoricalTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector @@ -7682,7 +8013,7 @@ public CategoricalOneHotVectorizerPipelineStep(Output output) namespace Transforms { - public sealed class CharTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class CharTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Name of the new column @@ -7801,7 +8132,7 @@ public CharacterTokenizerPipelineStep(Output output) namespace Transforms { - public sealed class ConcatTransformColumn : ManyToOneColumn, IManyToOneColumn + public sealed partial class ConcatTransformColumn : ManyToOneColumn, IManyToOneColumn { /// /// Name of the new column @@ -7891,7 +8222,7 @@ public ColumnConcatenatorPipelineStep(Output output) namespace Transforms { - public sealed class CopyColumnsTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class CopyColumnsTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Name of the new column @@ -8153,7 +8484,7 @@ public enum DataKind : byte } - public sealed class ConvertTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class ConvertTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// The result type @@ -8352,7 +8683,7 @@ public CombinerByContiguousGroupIdPipelineStep(Output output) namespace Transforms { - public sealed class NormalizeTransformAffineColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NormalizeTransformAffineColumn : OneToOneColumn, IOneToOneColumn { /// /// Whether to map zero to zero, preserving sparsity @@ -8625,7 +8956,7 @@ public sealed class Output namespace Transforms { - public sealed class TermTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class TermTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Maximum number of terms to keep when auto-training @@ -8979,7 +9310,7 @@ public FeatureSelectorByMutualInformationPipelineStep(Output output) namespace Transforms { - public sealed class LpNormNormalizerTransformGcnColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class LpNormNormalizerTransformGcnColumn : OneToOneColumn, IOneToOneColumn { /// /// Normalize by standard deviation rather than L2 norm @@ -9123,7 +9454,7 @@ public GlobalContrastNormalizerPipelineStep(Output output) namespace Transforms { - public sealed class HashJoinTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class HashJoinTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Whether the values need to be combined for a single hash @@ -9282,7 +9613,7 @@ public HashConverterPipelineStep(Output output) namespace Transforms { - public sealed class KeyToValueTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class KeyToValueTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Name of the new column @@ -9461,7 +9792,7 @@ public LabelColumnKeyBooleanConverterPipelineStep(Output output) namespace Transforms { - public sealed class LabelIndicatorTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class LabelIndicatorTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// The positive example class for binary classification. 
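Design note on the sealed → sealed partial churn in the hunks above and below: making the generated column/argument types partial allows hand-written convenience members, such as the TextLoaderRange constructors added in src/Microsoft.ML/Data/TextLoader.cs later in this diff, to live alongside the generator's output without hand-editing generated code.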
@@ -9645,7 +9976,7 @@ public LabelToFloatConverterPipelineStep(Output output) namespace Transforms { - public sealed class NormalizeTransformLogNormalColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NormalizeTransformLogNormalColumn : OneToOneColumn, IOneToOneColumn { /// /// Max number of examples used to train the normalizer @@ -9782,7 +10113,7 @@ public enum LpNormNormalizerTransformNormalizerKind : byte } - public sealed class LpNormNormalizerTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class LpNormNormalizerTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// The norm to use to normalize each sample @@ -10185,7 +10516,7 @@ public enum NAHandleTransformReplacementKind } - public sealed class NAHandleTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NAHandleTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// The replacement method to utilize @@ -10329,7 +10660,7 @@ public MissingValueHandlerPipelineStep(Output output) namespace Transforms { - public sealed class NAIndicatorTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NAIndicatorTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Name of the new column @@ -10443,7 +10774,7 @@ public MissingValueIndicatorPipelineStep(Output output) namespace Transforms { - public sealed class NADropTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NADropTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Name of the new column @@ -10637,7 +10968,7 @@ public enum NAReplaceTransformReplacementKind } - public sealed class NAReplaceTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NAReplaceTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Replacement value for NAs (uses default value if not given) @@ -10810,7 +11141,7 @@ public enum NgramTransformWeightingCriteria } - public sealed class NgramTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NgramTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Maximum ngram length @@ -11149,7 +11480,7 @@ public PredictedLabelColumnOriginalValueConverterPipelineStep(Output output) namespace Transforms { - public sealed class GenerateNumberTransformColumn + public sealed partial class GenerateNumberTransformColumn { /// /// Name of the new column @@ -11888,7 +12219,7 @@ public enum TextTransformTextNormKind } - public sealed class TextTransformColumn : ManyToOneColumn, IManyToOneColumn + public sealed partial class TextTransformColumn : ManyToOneColumn, IManyToOneColumn { /// /// Name of the new column @@ -11902,7 +12233,7 @@ public sealed class TextTransformColumn : ManyToOneColumn, } - public sealed class TermLoaderArguments + public sealed partial class TermLoaderArguments { /// /// List of terms @@ -12317,7 +12648,7 @@ public sealed class Output namespace Transforms { - public sealed class DelimitedTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class DelimitedTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn { /// /// Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character. diff --git a/src/Microsoft.ML/Data/TextLoader.cs b/src/Microsoft.ML/Data/TextLoader.cs new file mode 100644 index 0000000000..3c8550ef09 --- /dev/null +++ b/src/Microsoft.ML/Data/TextLoader.cs @@ -0,0 +1,179 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Data;
+using System;
+using System.Linq;
+using System.Reflection;
+using System.Text.RegularExpressions;
+
+namespace Microsoft.ML.Data
+{
+    public sealed partial class TextLoaderRange
+    {
+        public TextLoaderRange()
+        {
+        }
+
+        ///
+        /// Convenience constructor for the scalar case, when a given column
+        /// in the schema spans only a single column in the dataset.
+        /// Min and Max are both set to the single value ordinal.
+        ///
+        /// Column index in the dataset.
+        public TextLoaderRange(int ordinal)
+        {
+            Contracts.CheckParam(ordinal >= 0, nameof(ordinal), "Cannot be a negative number.");
+
+            Min = ordinal;
+            Max = ordinal;
+        }
+
+        ///
+        /// Convenience constructor for the vector case, when a given column
+        /// in the schema spans contiguous columns in the dataset.
+        ///
+        /// Starting column index in the dataset.
+        /// Ending column index in the dataset.
+        public TextLoaderRange(int min, int max)
+        {
+            Contracts.CheckParam(min >= 0, nameof(min), "Cannot be a negative number.");
+            Contracts.CheckParam(max >= min, nameof(max), "Cannot be less than " + nameof(min) + ".");
+
+            Min = min;
+            Max = max;
+        }
+    }
+
+    public sealed partial class TextLoader
+    {
+        ///
+        /// Construct a TextLoader object by inferring the dataset schema from a type.
+        ///
+        /// Does the file contain a header?
+        /// Column separator character. Default is '\t'.
+        /// Whether the input may include quoted values,
+        /// which can contain separator characters, colons,
+        /// and distinguish empty values from missing values. When true, consecutive separators
+        /// denote a missing value and an empty value is denoted by \"\".
+        /// When false, consecutive separators denote an empty value.
+        /// Whether the input may include sparse representations, e.g.
+        /// if one of the rows contains "5 2:6 4:3", that means there are 5 columns, all zero
+        /// except for the 3rd and 5th, which have the values 6 and 3.
+        /// Remove trailing whitespace from lines.
+        public TextLoader CreateFrom<TInput>(bool useHeader = false,
+            char separator = '\t', bool allowQuotedStrings = true,
+            bool supportSparse = true, bool trimWhitespace = false)
+        {
+            var fields = typeof(TInput).GetFields();
+            Arguments.Column = new TextLoaderColumn[fields.Length];
+            for (int index = 0; index < fields.Length; index++)
+            {
+                var field = fields[index];
+                var mappingAttr = field.GetCustomAttribute<ColumnAttribute>();
+                if (mappingAttr == null)
+                    throw Contracts.Except($"{field.Name} is missing ColumnAttribute");
+
+                if (Regex.Match(mappingAttr.Ordinal, @"[^(0-9,\*\-~)]+").Success)
+                    throw Contracts.Except($"{mappingAttr.Ordinal} contains invalid characters. " +
+                        $"Valid characters are 0-9, *, - and ~");
+
+                var name = mappingAttr.Name ?? field.Name;
+                if (name.Any(c => !Char.IsLetterOrDigit(c)))
+                    throw Contracts.Except($"{name} is not alphanumeric.");
+
+                Runtime.Data.TextLoader.Range[] sources;
+                if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Ordinal, out sources))
+                    throw Contracts.Except($"{mappingAttr.Ordinal} could not be parsed.");
+
+                Contracts.Assert(sources != null);
+
+                TextLoaderColumn tlc = new TextLoaderColumn();
+                tlc.Name = name;
+                tlc.Source = new TextLoaderRange[sources.Length];
+                DataKind dk;
+                if (!TryGetDataKind(field.FieldType.IsArray ?
field.FieldType.GetElementType() : field.FieldType, out dk)) + throw Contracts.Except($"{name} is of unsupported type."); + + tlc.Type = dk; + + for (int indexLocal = 0; indexLocal < tlc.Source.Length; indexLocal++) + { + tlc.Source[indexLocal] = new TextLoaderRange + { + AllOther = sources[indexLocal].AllOther, + AutoEnd = sources[indexLocal].AutoEnd, + ForceVector = sources[indexLocal].ForceVector, + VariableEnd = sources[indexLocal].VariableEnd, + Max = sources[indexLocal].Max, + Min = sources[indexLocal].Min + }; + } + + Arguments.Column[index] = tlc; + } + + Arguments.HasHeader = useHeader; + Arguments.Separator = new[] { separator }; + Arguments.AllowQuoting = allowQuotedStrings; + Arguments.AllowSparse = supportSparse; + Arguments.TrimWhitespace = trimWhitespace; + + return this; + } + + /// + /// Try to map a System.Type to a corresponding DataKind value. + /// + private static bool TryGetDataKind(Type type, out DataKind kind) + { + Contracts.AssertValue(type); + + // REVIEW: Make this more efficient. Should we have a global dictionary? + if (type == typeof(DvInt1) || type == typeof(sbyte)) + kind = DataKind.I1; + else if (type == typeof(byte) || type == typeof(char)) + kind = DataKind.U1; + else if (type == typeof(DvInt2) || type == typeof(short)) + kind = DataKind.I2; + else if (type == typeof(ushort)) + kind = DataKind.U2; + else if (type == typeof(DvInt4) || type == typeof(int)) + kind = DataKind.I4; + else if (type == typeof(uint)) + kind = DataKind.U4; + else if (type == typeof(DvInt8) || type == typeof(long)) + kind = DataKind.I8; + else if (type == typeof(ulong)) + kind = DataKind.U8; + else if (type == typeof(Single)) + kind = DataKind.R4; + else if (type == typeof(Double)) + kind = DataKind.R8; + else if (type == typeof(DvText) || type == typeof(string)) + kind = DataKind.TX; + else if (type == typeof(DvBool) || type == typeof(bool)) + kind = DataKind.BL; + else if (type == typeof(DvTimeSpan) || type == typeof(TimeSpan)) + kind = DataKind.TS; + else if (type == typeof(DvDateTime) || type == typeof(DateTime)) + kind = DataKind.DT; + else if (type == typeof(DvDateTimeZone) || type == typeof(TimeZoneInfo)) + kind = DataKind.DZ; + else if (type == typeof(UInt128)) + kind = DataKind.UG; + else + { + kind = default(DataKind); + return false; + } + + return true; + } + } +} diff --git a/src/Microsoft.ML/LearningPipeline.cs b/src/Microsoft.ML/LearningPipeline.cs index 51677afbf4..0e554734ea 100644 --- a/src/Microsoft.ML/LearningPipeline.cs +++ b/src/Microsoft.ML/LearningPipeline.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -68,7 +68,7 @@ public LearningPipeline() /// Possible data loader(s), transforms and trainers options are /// /// Data Loader: - /// + /// /// etc. 
/// /// @@ -154,7 +154,6 @@ public PredictionModel Train() step = currentItem.ApplyStep(step, experiment); if (step is ILearningPipelineDataStep dataStep && dataStep.Model != null) transformModels.Add(dataStep.Model); - else if (step is ILearningPipelinePredictorStep predictorDataStep) { if (lastTransformModel != null) diff --git a/src/Microsoft.ML/Runtime/EntryPoints/ImportTextData.cs b/src/Microsoft.ML/Runtime/EntryPoints/ImportTextData.cs index 8038294398..41048000d8 100644 --- a/src/Microsoft.ML/Runtime/EntryPoints/ImportTextData.cs +++ b/src/Microsoft.ML/Runtime/EntryPoints/ImportTextData.cs @@ -27,13 +27,25 @@ public sealed class Input public string CustomSchema = null; } + [TlcModule.EntryPointKind(typeof(ILearningPipelineLoader))] + public sealed class LoaderInput + { + [Argument(ArgumentType.Required, ShortName = "data", HelpText = "Location of the input file", SortOrder = 1)] + public IFileHandle InputFile; + + [Argument(ArgumentType.Required, ShortName = "args", HelpText = "Arguments", SortOrder = 2)] + public TextLoader.Arguments Arguments = new TextLoader.Arguments(); + } + public sealed class Output { [TlcModule.Output(Desc = "The resulting data view", SortOrder = 1)] public IDataView Data; } - [TlcModule.EntryPoint(Name = "Data.TextLoader", Desc = "Import a dataset from a text file")] +#pragma warning disable 0618 + [Obsolete("Use TextLoader instead.")] + [TlcModule.EntryPoint(Name = "Data.CustomTextLoader", Desc = "Import a dataset from a text file")] public static Output ImportText(IHostEnvironment env, Input input) { Contracts.CheckValue(env, nameof(env)); @@ -43,5 +55,17 @@ public static Output ImportText(IHostEnvironment env, Input input) var loader = host.CreateLoader(string.Format("Text{{{0}}}", input.CustomSchema), new FileHandleSource(input.InputFile)); return new Output { Data = loader }; } +#pragma warning restore 0618 + + [TlcModule.EntryPoint(Name = "Data.TextLoader", Desc = "Import a dataset from a text file")] + public static Output TextLoader(IHostEnvironment env, LoaderInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("ImportTextData"); + env.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + var loader = host.CreateLoader(input.Arguments, new FileHandleSource(input.InputFile)); + return new Output { Data = loader }; + } } } diff --git a/src/Microsoft.ML/Runtime/EntryPoints/TrainTestSplit.cs b/src/Microsoft.ML/Runtime/EntryPoints/TrainTestSplit.cs index 8b199045fa..40909ad108 100644 --- a/src/Microsoft.ML/Runtime/EntryPoints/TrainTestSplit.cs +++ b/src/Microsoft.ML/Runtime/EntryPoints/TrainTestSplit.cs @@ -93,7 +93,8 @@ public static string CreateStratificationColumn(IHost host, ref IDataView data, new HashJoinTransform.Arguments { Column = new[] { new HashJoinTransform.Column { Name = stratCol, Source = stratificationColumn } }, - Join = true + Join = true, + HashBits = 30 }, data); } diff --git a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs index f1e45fa446..5fabb15840 100644 --- a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs +++ b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs @@ -177,6 +177,37 @@ public static string Capitalize(string s) return char.ToUpperInvariant(s[0]) + s.Substring(1); } + private static string GetCharAsString(char value) + { + switch (value) + { + case '\t': + return "\\t"; + case '\n': + return "\\n"; + case '\r': + return "\\r"; + case '\\': + return 
"\\"; + case '\"': + return "\""; + case '\'': + return "\\'"; + case '\0': + return "\\0"; + case '\a': + return "\\a"; + case '\b': + return "\\b"; + case '\f': + return "\\f"; + case '\v': + return "\\v"; + default: + return value.ToString(); + } + } + public static string GetValue(ModuleCatalog catalog, Type fieldType, object fieldValue, Dictionary typesSymbolTable, string rootNameSpace = "") { @@ -264,7 +295,7 @@ public static string GetValue(ModuleCatalog catalog, Type fieldType, object fiel case TlcModule.DataKind.Enum: return GetEnumName(fieldType, typesSymbolTable, rootNameSpace) + "." + fieldValue; case TlcModule.DataKind.Char: - return $"'{(char)fieldValue}'"; + return $"'{GetCharAsString((char)fieldValue)}'"; case TlcModule.DataKind.Component: var type = fieldValue.GetType(); ModuleCatalog.ComponentInfo componentInfo; @@ -685,7 +716,7 @@ private void GenerateStructs(IndentingTextWriter writer, classBase = $" : OneToOneColumn<{_typesSymbolTable[type.FullName].Substring(_typesSymbolTable[type.FullName].LastIndexOf('.') + 1)}>, IOneToOneColumn"; else if (type.IsSubclassOf(typeof(ManyToOneColumn))) classBase = $" : ManyToOneColumn<{_typesSymbolTable[type.FullName].Substring(_typesSymbolTable[type.FullName].LastIndexOf('.') + 1)}>, IManyToOneColumn"; - writer.WriteLine($"public sealed class {_typesSymbolTable[type.FullName].Substring(_typesSymbolTable[type.FullName].LastIndexOf('.') + 1)}{classBase}"); + writer.WriteLine($"public sealed partial class {_typesSymbolTable[type.FullName].Substring(_typesSymbolTable[type.FullName].LastIndexOf('.') + 1)}{classBase}"); writer.WriteLine("{"); writer.Indent(); GenerateInputFields(writer, type, catalog, _typesSymbolTable); @@ -696,6 +727,58 @@ private void GenerateStructs(IndentingTextWriter writer, } } + private void GenerateLoaderAddInputMethod(IndentingTextWriter writer, string className) + { + //Constructor. + writer.WriteLine("[JsonIgnore]"); + writer.WriteLine("private string _inputFilePath = null;"); + writer.WriteLine($"public {className}(string filePath)"); + writer.WriteLine("{"); + writer.Indent(); + writer.WriteLine("_inputFilePath = filePath;"); + writer.Outdent(); + writer.WriteLine("}"); + writer.WriteLine(""); + + //SetInput. + writer.WriteLine($"public void SetInput(IHostEnvironment env, Experiment experiment)"); + writer.WriteLine("{"); + writer.Indent(); + writer.WriteLine("IFileHandle inputFile = new SimpleFileHandle(env, _inputFilePath, false, false);"); + writer.WriteLine("experiment.SetInput(InputFile, inputFile);"); + writer.Outdent(); + writer.WriteLine("}"); + writer.WriteLine(""); + + //Apply. + writer.WriteLine($"public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)"); + writer.WriteLine("{"); + writer.Indent(); + writer.WriteLine("Contracts.Assert(previousStep == null);"); + writer.WriteLine(""); + writer.WriteLine($"return new {className}PipelineStep(experiment.Add(this));"); + writer.Outdent(); + writer.WriteLine("}"); + writer.WriteLine(""); + + //Pipelinestep class. 
+ writer.WriteLine($"private class {className}PipelineStep : ILearningPipelineDataStep"); + writer.WriteLine("{"); + writer.Indent(); + writer.WriteLine($"public {className}PipelineStep (Output output)"); + writer.WriteLine("{"); + writer.Indent(); + writer.WriteLine("Data = output.Data;"); + writer.WriteLine("Model = null;"); + writer.Outdent(); + writer.WriteLine("}"); + writer.WriteLine(); + writer.WriteLine("public Var Data { get; }"); + writer.WriteLine("public Var Model { get; }"); + writer.Outdent(); + writer.WriteLine("}"); + } + private void GenerateColumnAddMethods(IndentingTextWriter writer, Type inputType, ModuleCatalog catalog, @@ -842,10 +925,11 @@ private void GenerateInput(IndentingTextWriter writer, var classAndMethod = GeneratorUtils.GetClassAndMethodNames(entryPointInfo); string classBase = ""; if (entryPointInfo.InputKinds != null) + { classBase += $" : {string.Join(", ", entryPointInfo.InputKinds.Select(GeneratorUtils.GetCSharpTypeName))}"; - - if (classBase.Contains("ITransformInput") || classBase.Contains("ITrainerInput")) - classBase += ", Microsoft.ML.ILearningPipelineItem"; + if (entryPointInfo.InputKinds.Any(t => typeof(ITrainerInput).IsAssignableFrom(t) || typeof(ITransformInput).IsAssignableFrom(t))) + classBase += ", Microsoft.ML.ILearningPipelineItem"; + } GenerateEnums(writer, entryPointInfo.InputType, classAndMethod.Item1); writer.WriteLine(); @@ -854,10 +938,17 @@ private void GenerateInput(IndentingTextWriter writer, foreach (var line in entryPointInfo.Description.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries)) writer.WriteLine($"/// {line}"); writer.WriteLine("/// "); + + if(entryPointInfo.ObsoleteAttribute != null) + writer.WriteLine($"[Obsolete(\"{entryPointInfo.ObsoleteAttribute.Message}\")]"); + writer.WriteLine($"public sealed partial class {classAndMethod.Item2}{classBase}"); writer.WriteLine("{"); writer.Indent(); writer.WriteLine(); + if (entryPointInfo.InputKinds != null && entryPointInfo.InputKinds.Any(t => typeof(ILearningPipelineLoader).IsAssignableFrom(t))) + GenerateLoaderAddInputMethod(writer, classAndMethod.Item2); + GenerateColumnAddMethods(writer, entryPointInfo.InputType, catalog, classAndMethod.Item2, out Type transformType); writer.WriteLine(); GenerateInputFields(writer, entryPointInfo.InputType, catalog, _typesSymbolTable); diff --git a/src/Microsoft.ML/TextLoader.cs b/src/Microsoft.ML/TextLoader.cs deleted file mode 100644 index 4e3e3fb8e4..0000000000 --- a/src/Microsoft.ML/TextLoader.cs +++ /dev/null @@ -1,124 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Runtime; -using Microsoft.ML.Runtime.Api; -using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.EntryPoints; -using System; -using System.Linq; -using System.Reflection; -using System.Text; - -namespace Microsoft.ML -{ - public class TextLoader : ILearningPipelineLoader - { - private string _inputFilePath; - private string CustomSchema; - private Data.TextLoader ImportTextInput; - - /// - /// Construct a TextLoader object - /// - /// Data file path - /// Does the file contains header? - /// How the columns are seperated? - /// Options: separator="tab", separator="space", separator="comma" or separator=[single character]. 
- /// By default separator=null means "tab" - /// Whether the input may include quoted values, - /// which can contain separator characters, colons, - /// and distinguish empty values from missing values. When true, consecutive separators - /// denote a missing value and an empty value is denoted by \"\". - /// When false, consecutive separators denote an empty value. - /// Whether the input may include sparse representations e.g. - /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero - /// except for 3rd and 5th columns which have values 6 and 3 - /// Remove trailing whitespace from lines - public TextLoader(string inputFilePath, bool useHeader = false, - string separator = null, bool allowQuotedStrings = true, - bool supportSparse = true, bool trimWhitespace = false) - { - _inputFilePath = inputFilePath; - SetCustomStringFromType(useHeader, separator, allowQuotedStrings, supportSparse, trimWhitespace); - } - - private IFileHandle GetTextLoaderFileHandle(IHostEnvironment env, string trainFilePath) => - new SimpleFileHandle(env, trainFilePath, false, false); - - private void SetCustomStringFromType(bool useHeader, string separator, - bool allowQuotedStrings, bool supportSparse, bool trimWhitespace) - { - StringBuilder schemaBuilder = new StringBuilder(CustomSchema); - foreach (var field in typeof(TInput).GetFields()) - { - var mappingAttr = field.GetCustomAttribute(); - if(mappingAttr == null) - throw Contracts.ExceptParam(field.Name, $"{field.Name} is missing ColumnAttribute"); - - schemaBuilder.AppendFormat("col={0}:{1}:{2} ", - mappingAttr.Name ?? field.Name, - TypeToName(field.FieldType.IsArray ? field.FieldType.GetElementType() : field.FieldType), - mappingAttr.Ordinal); - } - - if (useHeader) - schemaBuilder.Append(nameof(TextLoader.Arguments.HasHeader)).Append("+ "); - - if (separator != null) - schemaBuilder.Append(nameof(TextLoader.Arguments.Separator)).Append("=").Append(separator).Append(" "); - - if (!allowQuotedStrings) - schemaBuilder.Append(nameof(TextLoader.Arguments.AllowQuoting)).Append("- "); - - if (!supportSparse) - schemaBuilder.Append(nameof(TextLoader.Arguments.AllowSparse)).Append("- "); - - if (trimWhitespace) - schemaBuilder.Append(nameof(TextLoader.Arguments.TrimWhitespace)).Append("+ "); - - schemaBuilder.Length--; - CustomSchema = schemaBuilder.ToString(); - } - - private string TypeToName(Type type) - { - if (type == typeof(string)) - return "TX"; - else if (type == typeof(float) || type == typeof(double)) - return "R4"; - else if (type == typeof(bool)) - return "BL"; - else - throw new System.NotSupportedException("Type ${type.FullName} is not implemented or supported."); //Add more types. 
- } - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - Contracts.Assert(previousStep == null); - - ImportTextInput = new Data.TextLoader(); - ImportTextInput.CustomSchema = CustomSchema; - var importOutput = experiment.Add(ImportTextInput); - return new TextLoaderPipelineStep(importOutput.Data); - } - - public void SetInput(IHostEnvironment env, Experiment experiment) - { - IFileHandle inputFile = GetTextLoaderFileHandle(env, _inputFilePath); - experiment.SetInput(ImportTextInput.InputFile, inputFile); - } - - private class TextLoaderPipelineStep : ILearningPipelineDataStep - { - public TextLoaderPipelineStep(Var data) - { - Data = data; - } - - public Var Data { get; } - public Var Model => null; - } - } -} diff --git a/src/Native/build.cmd b/src/Native/build.cmd index 166773183c..e2bbc3a4dc 100644 --- a/src/Native/build.cmd +++ b/src/Native/build.cmd @@ -27,7 +27,6 @@ shift goto :Arg_Loop :ToolsVersion - if defined VisualStudioVersion goto :RunVCVars set _VSWHERE="%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" @@ -37,7 +36,8 @@ if exist %_VSWHERE% ( if not exist "%_VSCOMNTOOLS%" set _VSCOMNTOOLS=%VS140COMNTOOLS% if not exist "%_VSCOMNTOOLS%" goto :MissingVersion -set VSCMD_START_DIR="%__currentScriptDir%" + +set "VSCMD_START_DIR=%__currentScriptDir%" call "%_VSCOMNTOOLS%\VsDevCmd.bat" :RunVCVars @@ -92,8 +92,8 @@ if not exist "%__IntermediatesDir%" md "%__IntermediatesDir%" :: Regenerate the VS solution -set __gen-buildsys-win-path=%__currentScriptDir%\gen-buildsys-win.bat -set __source-code-path=%__currentScriptDir% +set "__gen-buildsys-win-path=%__currentScriptDir%\gen-buildsys-win.bat" +set "__source-code-path=%__currentScriptDir%" echo Calling "%__gen-buildsys-win-path%" "%__source-code-path%" "%__VSVersion%" %__BuildArch% pushd "%__IntermediatesDir%" diff --git a/src/Native/build.proj b/src/Native/build.proj index 1bfab0639c..83efe29a46 100644 --- a/src/Native/build.proj +++ b/src/Native/build.proj @@ -44,7 +44,7 @@ - + diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index e0583f58b7..adfa42e50d 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -4,6 +4,7 @@ using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Running; +using Microsoft.ML.Data; using Microsoft.ML.Models; using Microsoft.ML.Runtime.Api; using Microsoft.ML.Trainers; @@ -50,7 +51,7 @@ public void Setup() s_trainedModel = TrainCore(); IrisPrediction prediction = s_trainedModel.Predict(s_example); - var testData = new TextLoader(s_dataPath, useHeader: true, separator: "tab"); + var testData = new TextLoader(s_dataPath).CreateFrom(useHeader: true); var evaluator = new ClassificationEvaluator(); s_metrics = evaluator.Evaluate(s_trainedModel, testData); @@ -70,7 +71,7 @@ private static PredictionModel TrainCore() { var pipeline = new LearningPipeline(); - pipeline.Add(new TextLoader(s_dataPath, useHeader: true, separator: "tab")); + pipeline.Add(new TextLoader(s_dataPath).CreateFrom(useHeader: true)); pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")); diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestCSharpApi.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestCSharpApi.cs index ee4f56c260..439dc069f4 100644 --- 
a/test/Microsoft.ML.Core.Tests/UnitTests/TestCSharpApi.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestCSharpApi.cs @@ -36,7 +36,7 @@ public void TestSimpleExperiment() { var experiment = env.CreateExperiment(); - var importInput = new ML.Data.TextLoader(); + var importInput = new ML.Data.TextLoader(dataPath); var importOutput = experiment.Add(importInput); var normalizeInput = new ML.Transforms.MinMaxNormalizer @@ -67,7 +67,7 @@ public void TestSimpleTrainExperiment() { var experiment = env.CreateExperiment(); - var importInput = new ML.Data.TextLoader(); + var importInput = new ML.Data.TextLoader(dataPath); var importOutput = experiment.Add(importInput); var catInput = new ML.Transforms.CategoricalOneHotVectorizer @@ -165,7 +165,7 @@ public void TestTrainTestMacro() var experiment = env.CreateExperiment(); - var importInput = new ML.Data.TextLoader(); + var importInput = new ML.Data.TextLoader(dataPath); var importOutput = experiment.Add(importInput); var trainTestInput = new ML.Models.TrainTestBinaryEvaluator @@ -235,7 +235,7 @@ public void TestCrossValidationBinaryMacro() var experiment = env.CreateExperiment(); - var importInput = new ML.Data.TextLoader(); + var importInput = new ML.Data.TextLoader(dataPath); var importOutput = experiment.Add(importInput); var crossValidateBinary = new ML.Models.BinaryCrossValidator @@ -295,7 +295,7 @@ public void TestCrossValidationMacro() var modelCombineOutput = subGraph.Add(modelCombine); var experiment = env.CreateExperiment(); - var importInput = new ML.Data.TextLoader(); + var importInput = new ML.Data.TextLoader(dataPath); var importOutput = experiment.Add(importInput); var crossValidate = new ML.Models.CrossValidator @@ -330,5 +330,73 @@ public void TestCrossValidationMacro() } } } + + [Fact] + public void TestCrossValidationMacroWithStratification() + { + var dataPath = GetDataPath(@"breast-cancer.txt"); + using (var env = new TlcEnvironment()) + { + var subGraph = env.CreateExperiment(); + + var nop = new ML.Transforms.NoOperation(); + var nopOutput = subGraph.Add(nop); + + var learnerInput = new ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier + { + TrainingData = nopOutput.OutputData, + NumThreads = 1 + }; + var learnerOutput = subGraph.Add(learnerInput); + + var modelCombine = new ML.Transforms.ManyHeterogeneousModelCombiner + { + TransformModels = new ArrayVar(nopOutput.Model), + PredictorModel = learnerOutput.PredictorModel + }; + var modelCombineOutput = subGraph.Add(modelCombine); + + var experiment = env.CreateExperiment(); + var importInput = new ML.Data.TextLoader(dataPath); + importInput.Arguments.Column = new ML.Data.TextLoaderColumn[] + { + new ML.Data.TextLoaderColumn { Name = "Label", Source = new[] { new ML.Data.TextLoaderRange(0) } }, + new ML.Data.TextLoaderColumn { Name = "Strat", Source = new[] { new ML.Data.TextLoaderRange(1) } }, + new ML.Data.TextLoaderColumn { Name = "Features", Source = new[] { new ML.Data.TextLoaderRange(2, 9) } } + }; + var importOutput = experiment.Add(importInput); + + var crossValidate = new ML.Models.CrossValidator + { + Data = importOutput.Data, + Nodes = subGraph, + TransformModel = null, + StratificationColumn = "Strat" + }; + crossValidate.Inputs.Data = nop.Data; + crossValidate.Outputs.Model = modelCombineOutput.PredictorModel; + var crossValidateOutput = experiment.Add(crossValidate); + + experiment.Compile(); + experiment.SetInput(importInput.InputFile, new SimpleFileHandle(env, dataPath, false, false)); + experiment.Run(); + var data = 
experiment.GetOutput(crossValidateOutput.OverallMetrics[0]); + + var schema = data.Schema; + var b = schema.TryGetColumnIndex("AUC", out int metricCol); + Assert.True(b); + using (var cursor = data.GetRowCursor(col => col == metricCol)) + { + var getter = cursor.GetGetter(metricCol); + b = cursor.MoveNext(); + Assert.True(b); + double val = 0; + getter(ref val); + Assert.Equal(0.99, val, 2); + b = cursor.MoveNext(); + Assert.False(b); + } + } + } } } diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index e8be6c0370..24e8374b4c 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -33,7 +33,35 @@ public void EntryPointTrainTestSplit() { var dataPath = GetDataPath("breast-cancer.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, CustomSchema = "col=Label:0 col=Features:TX:1-9" }).Data; + /*var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input + { InputFile = inputFile, CustomSchema = "col=Label:0 col=Features:TX:1-9" }).Data;*/ + + var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + { + Arguments = + { + SeparatorChars = new []{',' }, + HasHeader = true, + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} }, + Type = Runtime.Data.DataKind.Text + }, + + new TextLoader.Column() + { + Name = "Features", + Source = new [] { new TextLoader.Range() { Min = 1, Max = 9} }, + Type = Runtime.Data.DataKind.Text + } + } + }, + + InputFile = inputFile + }).Data; var splitOutput = TrainTestSplit.Split(Env, new TrainTestSplit.Input { Data = dataView, Fraction = 0.9f }); @@ -62,7 +90,44 @@ public void EntryPointFeatureCombiner() { var dataPath = GetDataPath("breast-cancer.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, CustomSchema = "col=Label:0 col=F1:TX:1 col=F2:I4:2 col=Rest:3-9" }).Data; + var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + { + Arguments = + { + HasHeader = true, + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} } + }, + + new TextLoader.Column() + { + Name = "F1", + Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} }, + Type = Runtime.Data.DataKind.Text + }, + + new TextLoader.Column() + { + Name = "F2", + Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} }, + Type = Runtime.Data.DataKind.I4 + }, + + new TextLoader.Column() + { + Name = "Rest", + Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} } + } + } + }, + + InputFile = inputFile + }).Data; + dataView = Env.CreateTransform("Term{col=F1}", dataView); var result = FeatureCombiner.PrepareFeatures(Env, new FeatureCombiner.FeatureCombinerInput() { Data = dataView, Features = new[] { "F1", "F2", "Rest" } }).OutputData; var expected = Env.CreateTransform("Convert{col=F2 type=R4}", dataView); @@ -82,7 +147,44 @@ public void EntryPointScoring() { var dataPath = GetDataPath("breast-cancer.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, CustomSchema 
= "col=Label:0 col=F1:TX:1 col=F2:I4:2 col=Rest:3-9" }).Data; + var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + { + Arguments = + { + HasHeader = true, + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} } + }, + + new TextLoader.Column() + { + Name = "F1", + Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} }, + Type = Runtime.Data.DataKind.Text + }, + + new TextLoader.Column() + { + Name = "F2", + Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} }, + Type = Runtime.Data.DataKind.I4 + }, + + new TextLoader.Column() + { + Name = "Rest", + Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} } + } + } + }, + + InputFile = inputFile + }).Data; + dataView = Env.CreateTransform("Term{col=F1}", dataView); var trainData = FeatureCombiner.PrepareFeatures(Env, new FeatureCombiner.FeatureCombinerInput() { Data = dataView, Features = new[] { "F1", "F2", "Rest" } }); @@ -105,7 +207,44 @@ public void EntryPointApplyModel() { var dataPath = GetDataPath("breast-cancer.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, CustomSchema = "col=Label:0 col=F1:TX:1 col=F2:I4:2 col=Rest:3-9" }).Data; + var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + { + Arguments = + { + HasHeader = true, + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} }, + }, + + new TextLoader.Column() + { + Name = "F1", + Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} }, + Type = Runtime.Data.DataKind.Text + }, + + new TextLoader.Column() + { + Name = "F2", + Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} }, + Type = Runtime.Data.DataKind.I4 + }, + + new TextLoader.Column() + { + Name = "Rest", + Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} } + } + } + }, + + InputFile = inputFile + }).Data; + dataView = Env.CreateTransform("Term{col=F1}", dataView); var data1 = FeatureCombiner.PrepareFeatures(Env, new FeatureCombiner.FeatureCombinerInput() { Data = dataView, Features = new[] { "F1", "F2", "Rest" } }); @@ -120,7 +259,49 @@ public void EntryPointCaching() { var dataPath = GetDataPath("breast-cancer.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, CustomSchema = "col=Label:0 col=F1:TX:1 col=F2:I4:2 col=Rest:3-9" }).Data; + /*var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, + CustomSchema = "col=Label:0 col=F1:TX:1 col=F2:I4:2 col=Rest:3-9" }).Data; + */ + + var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + { + Arguments = + { + SeparatorChars = new []{',' }, + HasHeader = true, + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} } + }, + + new TextLoader.Column() + { + Name = "F1", + Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} }, + Type = Runtime.Data.DataKind.Text + }, + + new TextLoader.Column() + { + Name = "F2", + Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} }, + Type = Runtime.Data.DataKind.I4 + }, + + new TextLoader.Column() + { + Name = "Rest", + Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} } + } 
+ } + }, + + InputFile = inputFile + }).Data; + dataView = Env.CreateTransform("Term{col=F1}", dataView); var cached1 = Cache.CacheData(Env, new Cache.CacheInput() { Data = dataView, Caching = Cache.CachingType.Memory }); @@ -305,7 +486,7 @@ public void EntryPointOptionalParams() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file1' }, @@ -355,7 +536,7 @@ public void EntryPointExecGraphCommand() {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1' }}, @@ -512,7 +693,7 @@ public void EntryPointParseColumns() {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1' }}, @@ -562,7 +743,7 @@ public void EntryPointCountFeatures() {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1' }}, @@ -607,7 +788,7 @@ public void EntryPointMutualSelectFeatures() {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1' }}, @@ -653,7 +834,7 @@ public void EntryPointTextToKeyToText() {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1', 'CustomSchema': 'sep=comma col=Cat:TX:4' @@ -735,7 +916,7 @@ private void RunTrainScoreEvaluate(string learner, string evaluator, string data {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file' }}, @@ -1214,7 +1395,7 @@ internal void TestEntryPointPipelineRoutine(string dataFile, string schema, stri {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1', 'CustomSchema': '{schema}' @@ -1287,7 +1468,7 @@ internal void TestEntryPointRoutine(string dataFile, string trainerName, string {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1' {3} @@ -1459,7 +1640,7 @@ public void EntryPointNormalizeIfNeeded() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -1522,7 +1703,7 @@ public void EntryPointTrainTestBinaryMacro() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -1630,7 +1811,7 @@ public void EntryPointTrainTestMacroNoTransformInput() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -1744,7 +1925,7 @@ public void EntryPointTrainTestMacro() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -1843,7 +2024,7 @@ public void EntryPointChainedTrainTestMacros() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -2019,7 +2200,7 @@ public void EntryPointChainedCrossValMacros() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -2214,7 +2395,7 @@ public void EntryPointMacroEarlyExpansion() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -2302,7 +2483,7 @@ public void EntryPointSerialization() { 'Nodes': [ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': { 'InputFile': '$file' }, @@ -2368,7 +2549,7 @@ public void EntryPointNodeSchedulingFields() { 'Nodes': 
[ { - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'StageId': '5063dee8f19c4dd89a1fc3a9da5351a7', 'Inputs': { 'InputFile': '$file' @@ -2437,7 +2618,7 @@ public void EntryPointPrepareLabelConvertPredictedLabel() {{ 'Nodes': [ {{ - 'Name': 'Data.TextLoader', + 'Name': 'Data.CustomTextLoader', 'Inputs': {{ 'InputFile': '$file1', 'CustomSchema': 'sep=comma col=Label:TX:4 col=Features:Num:0-3' @@ -2527,7 +2708,9 @@ public void EntryPointTreeLeafFeaturizer() { var dataPath = GetDataPath(@"adult.tiny.with-schema.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); +#pragma warning disable 0618 var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile }).Data; +#pragma warning restore 0618 var cat = Categorical.CatTransformDict(Env, new CategoricalTransform.Arguments() { Data = dataView, diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index 5166540ccd..3620a3580a 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -20,14 +20,14 @@ public TestAutoInference(ITestOutputHelper helper) { } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] [TestCategory("EntryPoints")] public void TestLearn() { using (var env = new TlcEnvironment()) { - string pathData = GetDataPath(@"../UCI/adult.train"); - string pathDataTest = GetDataPath(@"../UCI/adult.test"); + string pathData = GetDataPath(@"../../Samples/UCI/adult.train"); + string pathDataTest = GetDataPath(@"../../Samples/UCI/adult.test"); int numOfSampleRows = 1000; int batchSize = 5; int numIterations = 10; @@ -49,46 +49,107 @@ public void TestLearn() // Use best pipeline for another task var inputFileTrain = new SimpleFileHandle(env, pathData, false, false); +#pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(env, new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data; var inputFileTest = new SimpleFileHandle(env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data; +#pragma warning restore 0618 // REVIEW: Theoretically, it could be the case that a new, very bad learner is introduced and // we get unlucky and only select it every time, such that this test fails. Not // likely at all, but a non-zero probability. Should be ok, since all current learners are returning d > .80. 
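A note on the #pragma warning disable 0618 pairs that start appearing around the remaining ImportTextData.ImportText call sites: CS0618 is the "member is obsolete" warning, so the legacy entry point has evidently been marked [Obsolete] in favor of the new TextLoader path, and tests that deliberately keep exercising it suppress the warning. A minimal self-contained sketch of the mechanics (the method and message below are illustrative stand-ins, not the real ML.NET signatures):

```csharp
using System;

static class LegacyCallSiteDemo
{
    // Stand-in for the deprecated entry point; only the CS0618 mechanics matter here.
    [Obsolete("Use the TextLoader entry point instead.")]
    public static string ImportText(string path) => "loaded " + path;

    public static void Main()
    {
        // Without the pragma pair, this call emits warning CS0618
        // ("member is obsolete"), which a warnings-as-errors build rejects.
#pragma warning disable 0618
        Console.WriteLine(ImportText("breast-cancer.txt"));
#pragma warning restore 0618
    }
}
```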
- double d = bestPipeline.RunTrainTestExperiment(datasetTrain, datasetTest, metric, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); - env.Check(d > 0.2); + bestPipeline.RunTrainTestExperiment(datasetTrain, datasetTest, metric, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer, + out var testMetricValue, out var trainMetricValue); + env.Check(testMetricValue > 0.2); } Done(); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] + [TestCategory("EntryPoints")] + public void TestPipelineSweeperMacroNoTransforms() + { + // Set up inputs for experiment + string pathData = GetDataPath(@"../../Samples/UCI/adult.train"); + string pathDataTest = GetDataPath(@"../../Samples/UCI/adult.test"); + const int numOfSampleRows = 1000; + const string schema = "sep=, col=Features:R4:0,2,4,10-12 col=Label:R4:14 header=+"; + + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + const int batchSize = 5; + const int numIterations = 20; + const int numTransformLevels = 2; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; + + // Using the simple, uniform random sampling (with replacement) engine + PipelineOptimizerBase autoMlEngine = new UniformRandomEngine(Env); + + // Create search object + var amls = new AutoInference.AutoMlMlState(Env, metric, autoMlEngine, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer, datasetTrain, datasetTest); + + // Infer search space + amls.InferSearchSpace(numTransformLevels); + + // Create macro object + var pipelineSweepInput = new Microsoft.ML.Models.PipelineSweeper() + { + BatchSize = batchSize, + }; + + var exp = new Experiment(Env); + var output = exp.Add(pipelineSweepInput); + exp.Compile(); + exp.SetInput(pipelineSweepInput.TrainingData, datasetTrain); + exp.SetInput(pipelineSweepInput.TestingData, datasetTest); + exp.SetInput(pipelineSweepInput.State, amls); + exp.SetInput(pipelineSweepInput.CandidateOutputs, new IDataView[0]); + exp.Run(); + + // Make sure you get back an AutoMlState, and that it ran for the correct number of iterations + // with at least minimal performance values (i.e., best should have AUC better than 0.1 on this dataset).
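(The assertions this comment announces follow immediately after this aside.) Two things worth flagging in the hunk above: RunTrainTestExperiment no longer returns a single double but reports both metrics through out parameters, and the sweeper macro is now driven through the Experiment API. The wiring, condensed into a sketch that reuses the test's own locals (not additional test code):

```csharp
// New signature: training and test metrics come back via out parameters.
bestPipeline.RunTrainTestExperiment(datasetTrain, datasetTest, metric,
    MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer,
    out var testMetricValue, out var trainMetricValue);

// Macro wiring: add the node, compile, bind all four inputs, run, read back.
var exp = new Experiment(Env);
var output = exp.Add(pipelineSweepInput);
exp.Compile();
exp.SetInput(pipelineSweepInput.TrainingData, datasetTrain);          // IDataView
exp.SetInput(pipelineSweepInput.TestingData, datasetTest);            // IDataView
exp.SetInput(pipelineSweepInput.State, amls);                         // pre-built AutoMlMlState
exp.SetInput(pipelineSweepInput.CandidateOutputs, new IDataView[0]);  // no seeded candidates
exp.Run();
var state = (AutoInference.AutoMlMlState)exp.GetOutput(output.State);
```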
+ AutoInference.AutoMlMlState amlsOut = (AutoInference.AutoMlMlState)exp.GetOutput(output.State); + Assert.NotNull(amlsOut); + Assert.Equal(amlsOut.GetAllEvaluatedPipelines().Length, numIterations); + Assert.True(amlsOut.GetBestPipeline().PerformanceSummary.MetricValue > 0.1); + } + + [Fact] [TestCategory("EntryPoints")] public void EntryPointPipelineSweepSerialization() { // Get datasets - var pathData = GetDataPath(@"../UCI/adult.train"); - var pathDataTest = GetDataPath(@"../UCI/adult.test"); + var pathData = GetDataPath(@"../../Samples/UCI/adult.train"); + var pathDataTest = GetDataPath(@"../../Samples/UCI/adult.test"); const int numOfSampleRows = 1000; int numIterations = 10; const string schema = "sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { 'Nodes': [ { - 'Name': 'Commands.PipelineSweep', + 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', @@ -130,7 +191,8 @@ public void EntryPointPipelineSweepSerialization() var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); - var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId"); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); } @@ -143,12 +205,13 @@ public void EntryPointPipelineSweep() const int numOfSampleRows = 1000; int numIterations = 4; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTrain }).Data.Take(numOfSampleRows); var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest }).Data.Take(numOfSampleRows); - +#pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { @@ -201,355 +264,344 @@ public void EntryPointPipelineSweep() var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); - var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId"); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); + Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); } - [Fact(Skip = "Datasets Not Present")] + [Fact] public void TestRocketPipelineEngine() { - //// Get datasets - //var pathData = GetDataPath(@"../UCI", "adult.train"); - //var pathDataTest = GetDataPath(@"../UCI", "adult.test"); - //const int numOfSampleRows = 1000; - //int numIterations = 35; - 
//const string schema = - //"sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + - //"col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; - //var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); - //var datasetTrain = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); - //var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); - //var datasetTest = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); - - //// Define entrypoint graph - //string inputGraph = @" - //{ - //'Nodes': [ - //{ - //'Name': 'Commands.PipelineSweep', - //'Inputs': { - //'TrainingData': '$TrainingData', - //'TestingData': '$TestingData', - //'StateArguments': { - //'Name': 'AutoMlState', - //'Settings': { - //'Metric': 'Auc', - //'Engine': { - //'Name': 'Rocket', - //'Settings' : { - //'TopKLearners' : 2, - //'SecondRoundTrialsPerLearner' : 5 - //}, - //}, - //'TerminatorArgs': { - //'Name': 'IterationLimited', - //'Settings': { - //'FinalHistoryLength': 35 - //} - //}, - //'TrainerKind': 'SignatureBinaryClassifierTrainer' - //} - //}, - //'BatchSize': 5 - //}, - //'Outputs': { - //'State': '$StateOut', - //'Results': '$ResultsOut' - //} - //}, - //] - //}"; - - //JObject graph = JObject.Parse(inputGraph); - //var catalog = ModuleCatalog.CreateInstance(Env); - - //var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); - //runner.SetInput("TrainingData", datasetTrain); - //runner.SetInput("TestingData", datasetTest); - //runner.RunAll(); - - //var autoMlState = runner.GetOutput("StateOut"); - //Assert.IsNotNull(autoMlState); - //var allPipelines = autoMlState.GetAllEvaluatedPipelines(); - //var bestPipeline = autoMlState.GetBestPipeline(); - //Assert.AreEqual(allPipelines.Length, numIterations); - //Assert.IsTrue(bestPipeline.PerformanceSummary.MetricValue > 0.1); - - //var results = runner.GetOutput("ResultsOut"); - //Assert.IsNotNull(results); - //var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId"); - //Assert.IsTrue(rows.Length == numIterations); + // Get datasets + var pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + var pathDataTest = GetDataPath(@"../../Samples/UCI", "adult.test"); + const int numOfSampleRows = 1000; + int numIterations = 35; + const string schema = + "sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 
'TestingData': '$TestingData', + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'Rocket', + 'Settings' : { + 'TopKLearners' : 2, + 'SecondRoundTrialsPerLearner' : 5 + }, + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 35 + } + }, + 'TrainerKind': 'SignatureBinaryClassifierTrainer' + } + }, + 'BatchSize': 5 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graph = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + + var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var allPipelines = autoMlState.GetAllEvaluatedPipelines(); + var bestPipeline = autoMlState.GetBestPipeline(); + Assert.Equal(allPipelines.Length, numIterations); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + + var results = runner.GetOutput("ResultsOut"); + Assert.NotNull(results); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); + Assert.True(rows.Length == numIterations); } [Fact(Skip = "Need CoreTLC specific baseline update")] public void TestTextDatasetLearn() { - //using (var env = new TlcEnvironment()) - //{ - //string pathData = GetDataPath(@"../UnitTest/tweets_labeled_10k_test_validation.tsv"); - //int batchSize = 5; - //int numIterations = 35; - //int numTransformLevels = 1; - //int numSampleRows = 100; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.AccuracyMicro; - - //// Using the simple, uniform random sampling (with replacement) engine - //PipelineOptimizerBase autoMlEngine = new UniformRandomEngine(env); - - //// Test initial learning - //var amls = AutoInference.InferPipelines(env, autoMlEngine, pathData, "", out var _, numTransformLevels, batchSize, - //metric, out var _, numSampleRows, new IterationTerminator(numIterations), - //MacroUtils.TrainerKinds.SignatureMultiClassClassifierTrainer); - //env.Check(amls.GetAllEvaluatedPipelines().Length == numIterations); - //} - //Done(); - } + using (var env = new TlcEnvironment()) + { + string pathData = GetDataPath(@"../UnitTest/tweets_labeled_10k_test_validation.tsv"); + int batchSize = 5; + int numIterations = 35; + int numTransformLevels = 1; + int numSampleRows = 100; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.AccuracyMicro; - [Fact(Skip = "Need CoreTLC specific baseline update")] - public void TestPipelineNodeCloning() - { - //using (var env = new TlcEnvironment()) - //{ - //var lr1 = RecipeInference - //.AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) - //.First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("LogisticRegression")); - - //var sdca1 = RecipeInference - //.AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) - //.First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("Sdca")); - - //// Clone and change hyperparam values - //var lr2 = lr1.Clone(); - //lr1.PipelineNode.SweepParams[0].RawValue = 1.2f; - //lr2.PipelineNode.SweepParams[0].RawValue = 3.5f; - //var sdca2 = sdca1.Clone(); - //sdca1.PipelineNode.SweepParams[0].RawValue = 3; - 
//sdca2.PipelineNode.SweepParams[0].RawValue = 0; - - //// Make sure the changes are propagated to entry point objects - //env.Check(lr1.PipelineNode.UpdateProperties()); - //env.Check(lr2.PipelineNode.UpdateProperties()); - //env.Check(sdca1.PipelineNode.UpdateProperties()); - //env.Check(sdca2.PipelineNode.UpdateProperties()); - //env.Check(lr1.PipelineNode.CheckEntryPointStateMatchesParamValues()); - //env.Check(lr2.PipelineNode.CheckEntryPointStateMatchesParamValues()); - //env.Check(sdca1.PipelineNode.CheckEntryPointStateMatchesParamValues()); - //env.Check(sdca2.PipelineNode.CheckEntryPointStateMatchesParamValues()); - - //// Make sure second object's set of changes didn't overwrite first object's - //env.Check(!lr1.PipelineNode.SweepParams[0].RawValue.Equals(lr2.PipelineNode.SweepParams[0].RawValue)); - //env.Check(!sdca2.PipelineNode.SweepParams[0].RawValue.Equals(sdca1.PipelineNode.SweepParams[0].RawValue)); - //} + // Using the simple, uniform random sampling (with replacement) engine + PipelineOptimizerBase autoMlEngine = new UniformRandomEngine(env); + + // Test initial learning + var amls = AutoInference.InferPipelines(env, autoMlEngine, pathData, "", out var _, numTransformLevels, batchSize, + metric, out var _, numSampleRows, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureMultiClassClassifierTrainer); + env.Check(amls.GetAllEvaluatedPipelines().Length == numIterations); + } + Done(); } - [Fact(Skip = "Need CoreTLC specific baseline update")] - public void TestSupportedMetricsByName() + [Fact] + public void TestPipelineNodeCloning() { - //var fields = - //typeof(AutoInference.SupportedMetric).GetMembers(BindingFlags.Static | BindingFlags.Public) - //.Where(s => s.MemberType == MemberTypes.Field); - //foreach (var field in fields) - //{ - //var metric = AutoInference.SupportedMetric.ByName(field.Name); - //Assert.IsTrue(metric?.Name == field.Name); - //} - + using (var env = new TlcEnvironment()) + { + var lr1 = RecipeInference + .AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) + .First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("LogisticRegression")); + + var sdca1 = RecipeInference + .AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) + .First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("StochasticDualCoordinateAscent")); + + // Clone and change hyperparam values + var lr2 = lr1.Clone(); + lr1.PipelineNode.SweepParams[0].RawValue = 1.2f; + lr2.PipelineNode.SweepParams[0].RawValue = 3.5f; + var sdca2 = sdca1.Clone(); + sdca1.PipelineNode.SweepParams[0].RawValue = 3; + sdca2.PipelineNode.SweepParams[0].RawValue = 0; + + // Make sure the changes are propagated to entry point objects + env.Check(lr1.PipelineNode.UpdateProperties()); + env.Check(lr2.PipelineNode.UpdateProperties()); + env.Check(sdca1.PipelineNode.UpdateProperties()); + env.Check(sdca2.PipelineNode.UpdateProperties()); + env.Check(lr1.PipelineNode.CheckEntryPointStateMatchesParamValues()); + env.Check(lr2.PipelineNode.CheckEntryPointStateMatchesParamValues()); + env.Check(sdca1.PipelineNode.CheckEntryPointStateMatchesParamValues()); + env.Check(sdca2.PipelineNode.CheckEntryPointStateMatchesParamValues()); + + // Make sure second object's set of changes didn't overwrite first object's + env.Check(!lr1.PipelineNode.SweepParams[0].RawValue.Equals(lr2.PipelineNode.SweepParams[0].RawValue)); + 
env.Check(!sdca2.PipelineNode.SweepParams[0].RawValue.Equals(sdca1.PipelineNode.SweepParams[0].RawValue)); + } } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestHyperparameterFreezing() { - //string pathData = GetDataPath(@"../UCI", "adult.train"); - //int numOfSampleRows = 1000; - //int batchSize = 1; - //int numIterations = 10; - //int numTransformLevels = 3; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; - - //// Using the simple, uniform random sampling (with replacement) brain - //PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); - - //// Run initial experiments - //var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, - //metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), - //MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); - - //// Clear results - //amls.ClearEvaluatedPipelines(); - - //// Get space, remove transforms and all but one learner, freeze hyperparameters on learner. - //var space = amls.GetSearchSpace(); - //var transforms = space.Item1.Where(t => - //t.ExpertType != typeof(TransformInference.Experts.Categorical)).ToArray(); - //var learners = new[] { space.Item2.First() }; - //var hyperParam = learners[0].PipelineNode.SweepParams.First(); - //var frozenParamValue = hyperParam.RawValue; - //hyperParam.Frozen = true; - //amls.UpdateSearchSpace(learners, transforms); - - //// Allow for one more iteration - //amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); - - //// Do learning. Only retained learner should be left in all pipelines. - //bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); - - //// Make sure all pipelines have retained learner - //Assert.IsTrue(amls.GetAllEvaluatedPipelines().All(p => p.Learner.LearnerName == learners[0].LearnerName)); - - //// Make sure hyperparameter value did not change - //Assert.IsNotNull(bestPipeline); - //Assert.AreEqual(bestPipeline.Learner.PipelineNode.SweepParams.First().RawValue, frozenParamValue); + string pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + int numOfSampleRows = 1000; + int batchSize = 1; + int numIterations = 10; + int numTransformLevels = 3; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; + + // Using the simple, uniform random sampling (with replacement) brain + PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); + + // Run initial experiments + var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, + metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); + + // Clear results + amls.ClearEvaluatedPipelines(); + + // Get space, remove transforms and all but one learner, freeze hyperparameters on learner. + var space = amls.GetSearchSpace(); + var transforms = space.Item1.Where(t => + t.ExpertType != typeof(TransformInference.Experts.Categorical)).ToArray(); + var learners = new[] { space.Item2.First() }; + var hyperParam = learners[0].PipelineNode.SweepParams.First(); + var frozenParamValue = hyperParam.RawValue; + hyperParam.Frozen = true; + amls.UpdateSearchSpace(learners, transforms); + + // Allow for one more iteration + amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); + + // Do learning. Only retained learner should be left in all pipelines. 
+ bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); + + // Make sure all pipelines have retained learner + Assert.True(amls.GetAllEvaluatedPipelines().All(p => p.Learner.LearnerName == learners[0].LearnerName)); + + // Make sure hyperparameter value did not change + Assert.NotNull(bestPipeline); + Assert.Equal(bestPipeline.Learner.PipelineNode.SweepParams.First().RawValue, frozenParamValue); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact(Skip = "Dataset not available.")] public void TestRegressionPipelineWithMinimizingMetric() { - //string pathData = GetDataPath("../Housing (regression)/housing.txt"); - //int numOfSampleRows = 100; - //int batchSize = 5; - //int numIterations = 10; - //int numTransformLevels = 1; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.L1; - - //// Using the simple, uniform random sampling (with replacement) brain - //PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); - - //// Run initial experiments - //var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, - //metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), - //MacroUtils.TrainerKinds.SignatureRegressorTrainer); - - //// Allow for one more iteration - //amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); - - //// Do learning. Only retained learner should be left in all pipelines. - //bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); - - //// Make sure hyperparameter value did not change - //Assert.IsNotNull(bestPipeline); - //Assert.IsTrue(amls.GetAllEvaluatedPipelines().All( - //p => p.PerformanceSummary.MetricValue >= bestPipeline.PerformanceSummary.MetricValue)); + string pathData = GetDataPath("../Housing (regression)/housing.txt"); + int numOfSampleRows = 100; + int batchSize = 5; + int numIterations = 10; + int numTransformLevels = 1; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.L1; + + // Using the simple, uniform random sampling (with replacement) brain + PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); + + // Run initial experiments + var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, + metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureRegressorTrainer); + + // Allow for one more iteration + amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); + + // Do learning. Only retained learner should be left in all pipelines. 
+ bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); + + // Make sure hyperparameter value did not change + Assert.NotNull(bestPipeline); + Assert.True(amls.GetAllEvaluatedPipelines().All( + p => p.PerformanceSummary.MetricValue >= bestPipeline.PerformanceSummary.MetricValue)); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestLearnerConstrainingByName() { - //string pathData = GetDataPath(@"../UCI", "adult.train"); - //int numOfSampleRows = 1000; - //int batchSize = 1; - //int numIterations = 1; - //int numTransformLevels = 2; - //var prefix = "Microsoft.ML.Api.Experiment"; - //var retainedLearnerNames = new[] { $"{prefix}.LogisticRegression", $"{prefix}.FastTree" }; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; - - //// Using the simple, uniform random sampling (with replacement) brain. - //PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); - - //// Run initial experiment. - //var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, - //numTransformLevels, batchSize, metric, out var _, numOfSampleRows, - //new IterationTerminator(numIterations), MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); - - //// Keep only logistic regression and FastTree. - //amls.KeepSelectedLearners(retainedLearnerNames); - //var space = amls.GetSearchSpace(); - - //// Make sure only learners left are those retained. - //Assert.AreEqual(retainedLearnerNames.Length, space.Item2.Length); - //Assert.IsTrue(space.Item2.All(l => retainedLearnerNames.Any(r => r == l.LearnerName))); + string pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + int numOfSampleRows = 1000; + int batchSize = 1; + int numIterations = 1; + int numTransformLevels = 2; + var retainedLearnerNames = new[] { $"LogisticRegressionBinaryClassifier", $"FastTreeBinaryClassifier" }; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; + + // Using the simple, uniform random sampling (with replacement) brain. + PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); + + // Run initial experiment. + var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, + numTransformLevels, batchSize, metric, out var _, numOfSampleRows, + new IterationTerminator(numIterations), MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); + + // Keep only logistic regression and FastTree. + amls.KeepSelectedLearners(retainedLearnerNames); + var space = amls.GetSearchSpace(); + + // Make sure only learners left are those retained. 
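(The Assert lines this comment refers to continue right below.) A pattern shared by the re-enabled sweeper tests: learner names are now the short entry-point names rather than the old fully-qualified Microsoft.ML.Api.Experiment.* type names. The filtering call, isolated as a sketch using the test's amls state object:

```csharp
// Old style (from the previously commented-out bodies):
//   new[] { "Microsoft.ML.Api.Experiment.LogisticRegression",
//           "Microsoft.ML.Api.Experiment.FastTree" }
// New style: short entry-point names.
var retained = new[] { "LogisticRegressionBinaryClassifier", "FastTreeBinaryClassifier" };
amls.KeepSelectedLearners(retained); // search space keeps only these learners
var space = amls.GetSearchSpace();   // space.Item2 should now hold exactly two entries
```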
+ Assert.Equal(retainedLearnerNames.Length, space.Item2.Length); + Assert.True(space.Item2.All(l => retainedLearnerNames.Any(r => r == l.LearnerName))); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestRequestedLearners() { - //// Get datasets - //var pathData = GetDataPath(@"../UCI", "adult.train"); - //var pathDataTest = GetDataPath(@"../UCI", "adult.test"); - //const int numOfSampleRows = 100; - //const string schema = - //"sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + - //"col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; - //var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); - //var datasetTrain = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); - //var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); - //var datasetTest = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); - //var prefix = "Microsoft.ML.Api.Experiment"; - //var requestedLearners = new[] { $"{prefix}.LogisticRegression", $"{prefix}.FastTree" }; - - //// Define entrypoint graph - //string inputGraph = @" - //{ - //'Nodes': [ - //{ - //'Name': 'Commands.PipelineSweep', - //'Inputs': { - //'TrainingData': '$TrainingData', - //'TestingData': '$TestingData', - //'StateArguments': { - //'Name': 'AutoMlState', - //'Settings': { - //'Metric': 'Auc', - //'Engine': { - //'Name': 'Rocket', - //'Settings' : { - //'TopKLearners' : 2, - //'SecondRoundTrialsPerLearner' : 0 - //}, - //}, - //'TerminatorArgs': { - //'Name': 'IterationLimited', - //'Settings': { - //'FinalHistoryLength': 35 - //} - //}, - //'TrainerKind': 'SignatureBinaryClassifierTrainer', - //'RequestedLearners' : [ - //'Microsoft.ML.Api.Experiment.LogisticRegression', - //'Microsoft.ML.Api.Experiment.FastTree' - //] - //} - //}, - //'BatchSize': 5 - //}, - //'Outputs': { - //'State': '$StateOut', - //'Results': '$ResultsOut' - //} - //}, - //] - //}"; - - //JObject graph = JObject.Parse(inputGraph); - //var catalog = ModuleCatalog.CreateInstance(Env); - - //var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); - //runner.SetInput("TrainingData", datasetTrain); - //runner.SetInput("TestingData", datasetTest); - //runner.RunAll(); - - //var autoMlState = runner.GetOutput("StateOut"); - //Assert.IsNotNull(autoMlState); - //var space = autoMlState.GetSearchSpace(); - - //// Make sure only learners left are those retained. 
- //Assert.AreEqual(requestedLearners.Length, space.Item2.Length); - //Assert.IsTrue(space.Item2.All(l => requestedLearners.Any(r => r == l.LearnerName))); + // Get datasets + var pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + var pathDataTest = GetDataPath(@"../../Samples/UCI", "adult.test"); + const int numOfSampleRows = 100; + const string schema = + "sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=race:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); + var requestedLearners = new[] { $"LogisticRegressionBinaryClassifier", $"FastTreeBinaryClassifier" }; +#pragma warning restore 0618 + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 'TestingData': '$TestingData', + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'Rocket', + 'Settings' : { + 'TopKLearners' : 2, + 'SecondRoundTrialsPerLearner' : 0 + }, + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 35 + } + }, + 'TrainerKind': 'SignatureBinaryClassifierTrainer', + 'RequestedLearners' : [ + 'LogisticRegressionBinaryClassifier', + 'FastTreeBinaryClassifier' + ] + } + }, + 'BatchSize': 5 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graph = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + + var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var space = autoMlState.GetSearchSpace(); + + // Make sure only learners left are those retained. 
+ Assert.Equal(requestedLearners.Length, space.Item2.Length); + Assert.True(space.Item2.All(l => requestedLearners.Any(r => r == l.LearnerName))); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestMinimizingMetricTransformations() { - //var values = new[] { 100d, 10d, -2d, -1d, 5.8d, -3.1d }; - //var maxWeight = values.Max(); - //var processed = values.Select(v => AutoMlUtils.ProcessWeight(v, maxWeight, false)); - //var expectedResult = new[] { 0d, 90d, 102d, 101d, 94.2d, 103.1d }; + var values = new[] { 100d, 10d, -2d, -1d, 5.8d, -3.1d }; + var maxWeight = values.Max(); + var processed = values.Select(v => AutoMlUtils.ProcessWeight(v, maxWeight, false)); + var expectedResult = new[] { 0d, 90d, 102d, 101d, 94.2d, 103.1d }; - //Assert.IsTrue(processed.Select((x, idx) => Math.Abs(x - expectedResult[idx]) < 0.001).All(r => r)); + Assert.True(processed.Select((x, idx) => System.Math.Abs(x - expectedResult[idx]) < 0.001).All(r => r)); } } } diff --git a/test/Microsoft.ML.TestFramework/ModelHelper.cs b/test/Microsoft.ML.TestFramework/ModelHelper.cs index dca360c4e3..1b0ab4eb8e 100644 --- a/test/Microsoft.ML.TestFramework/ModelHelper.cs +++ b/test/Microsoft.ML.TestFramework/ModelHelper.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.ML.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; @@ -40,24 +41,187 @@ public static void WriteKcHousePriceModel(string dataPath, Stream stream) public static IDataView GetKcHouseDataView(string dataPath) { - var dataSchema = "col=Id:TX:0 col=Date:TX:1 col=Label:R4:2 col=Bedrooms:R4:3 col=Bathrooms:R4:4 col=SqftLiving:R4:5 col=SqftLot:R4:6 col=Floors:R4:7 col=Waterfront:R4:8 col=View:R4:9 col=Condition:R4:10 col=Grade:R4:11 col=SqftAbove:R4:12 col=SqftBasement:R4:13 col=YearBuilt:R4:14 col=YearRenovated:R4:15 col=Zipcode:R4:16 col=Lat:R4:17 col=Long:R4:18 col=SqftLiving15:R4:19 col=SqftLot15:R4:20 header+ sep=,"; - var txtArgs = new TextLoader.Arguments(); + var dataSchema = "col=Id:TX:0 col=Date:TX:1 col=Label:R4:2 col=Bedrooms:R4:3 " + + "col=Bathrooms:R4:4 col=SqftLiving:R4:5 col=SqftLot:R4:6 col=Floors:R4:7 " + + "col=Waterfront:R4:8 col=View:R4:9 col=Condition:R4:10 col=Grade:R4:11 " + + "col=SqftAbove:R4:12 col=SqftBasement:R4:13 col=YearBuilt:R4:14 " + + "col=YearRenovated:R4:15 col=Zipcode:R4:16 col=Lat:R4:17 col=Long:R4:18 " + + "col=SqftLiving15:R4:19 col=SqftLot15:R4:20 header+ sep=,"; + + var txtArgs = new Runtime.Data.TextLoader.Arguments(); bool parsed = CmdParser.ParseArguments(s_environment, dataSchema, txtArgs); s_environment.Assert(parsed); - var txtLoader = new TextLoader(s_environment, txtArgs, new MultiFileSource(dataPath)); + var txtLoader = new Runtime.Data.TextLoader(s_environment, txtArgs, new MultiFileSource(dataPath)); return txtLoader; } private static ITransformModel CreateKcHousePricePredictorModel(string dataPath) { - var dataSchema = "col=Id:TX:0 col=Date:TX:1 col=Label:R4:2 col=Bedrooms:R4:3 col=Bathrooms:R4:4 col=SqftLiving:R4:5 col=SqftLot:R4:6 col=Floors:R4:7 col=Waterfront:R4:8 col=View:R4:9 col=Condition:R4:10 col=Grade:R4:11 col=SqftAbove:R4:12 col=SqftBasement:R4:13 col=YearBuilt:R4:14 col=YearRenovated:R4:15 col=Zipcode:R4:16 col=Lat:R4:17 col=Long:R4:18 col=SqftLiving15:R4:19 col=SqftLot15:R4:20 header+ sep=,"; - Experiment experiment = s_environment.CreateExperiment(); - var importData = new Data.TextLoader(); 
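The re-enabled TestMinimizingMetricTransformations above pins down the weight transformation for minimizing metrics: the expected values imply that AutoMlUtils.ProcessWeight(v, max, false) behaves as max - v, flipping smaller-is-better into larger-is-better. A standalone check of that reading (the local ProcessWeight is a stand-in, not the library implementation); the ModelHelper.cs rewrite resumes below:

```csharp
using System;
using System.Linq;

static class WeightCheck
{
    // Stand-in mirroring the behavior implied by the expected values:
    // for minimizing metrics (isMaximizing == false) a raw value v maps
    // to maxWeight - v, so smaller-is-better becomes larger-is-better.
    static double ProcessWeight(double v, double maxWeight, bool isMaximizing)
        => isMaximizing ? v : maxWeight - v;

    public static void Main()
    {
        var values = new[] { 100d, 10d, -2d, -1d, 5.8d, -3.1d };
        var max = values.Max();
        var expected = new[] { 0d, 90d, 102d, 101d, 94.2d, 103.1d };
        var ok = values.Select(v => ProcessWeight(v, max, false))
                       .Zip(expected, (p, e) => Math.Abs(p - e) < 0.001)
                       .All(r => r);
        Console.WriteLine(ok); // True
    }
}
```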
- importData.CustomSchema = dataSchema; - Data.TextLoader.Output imported = experiment.Add(importData); + var importData = new Data.TextLoader(dataPath) + { + Arguments = new TextLoaderArguments + { + Separator = new[] { ',' }, + HasHeader = true, + Column = new[] + { + new TextLoaderColumn() + { + Name = "Id", + Source = new [] { new TextLoaderRange(0) }, + Type = Runtime.Data.DataKind.Text + }, + + new TextLoaderColumn() + { + Name = "Date", + Source = new [] { new TextLoaderRange(1) }, + Type = Runtime.Data.DataKind.Text + }, + + new TextLoaderColumn() + { + Name = "Label", + Source = new [] { new TextLoaderRange(2) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Bedrooms", + Source = new [] { new TextLoaderRange(3) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Bathrooms", + Source = new [] { new TextLoaderRange(4) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLiving", + Source = new [] { new TextLoaderRange(5) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLot", + Source = new [] { new TextLoaderRange(6) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Floors", + Source = new [] { new TextLoaderRange(7) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Waterfront", + Source = new [] { new TextLoaderRange(8) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "View", + Source = new [] { new TextLoaderRange(9) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Condition", + Source = new [] { new TextLoaderRange(10) }, + Type = Runtime.Data.DataKind.Num + }, + new TextLoaderColumn() + { + Name = "Grade", + Source = new [] { new TextLoaderRange(11) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftAbove", + Source = new [] { new TextLoaderRange(12) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftBasement", + Source = new [] { new TextLoaderRange(13) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "YearBuilt", + Source = new [] { new TextLoaderRange(14) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "YearRenovated", + Source = new [] { new TextLoaderRange(15) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Zipcode", + Source = new [] { new TextLoaderRange(16) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Lat", + Source = new [] { new TextLoaderRange(17) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Long", + Source = new [] { new TextLoaderRange(18) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLiving15", + Source = new [] { new TextLoaderRange(19) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLot15", + Source = new [] { new TextLoaderRange(20) }, + Type = Runtime.Data.DataKind.Num + }, + } + } + + //new Data.CustomTextLoader(); + // importData.CustomSchema = dataSchema; + // + }; + + Data.TextLoader.Output imported = experiment.Add(importData); var numericalConcatenate = new Transforms.ColumnConcatenator(); numericalConcatenate.Data = imported.Data; numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", 
"SqftLiving15", "SqftLot15"); diff --git a/test/Microsoft.ML.Tests/LearningPipelineTests.cs b/test/Microsoft.ML.Tests/LearningPipelineTests.cs index 4519fc5285..3ccc36255f 100644 --- a/test/Microsoft.ML.Tests/LearningPipelineTests.cs +++ b/test/Microsoft.ML.Tests/LearningPipelineTests.cs @@ -5,6 +5,7 @@ using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Runtime.Data; using Microsoft.ML.TestFramework; using Microsoft.ML.Trainers; using Microsoft.ML.Transforms; @@ -66,7 +67,7 @@ public void TransformOnlyPipeline() { const string _dataPath = @"..\..\Data\breast-cancer.txt"; var pipeline = new LearningPipeline(); - pipeline.Add(new TextLoader(_dataPath, useHeader: false)); + pipeline.Add(new ML.Data.TextLoader(_dataPath).CreateFrom(useHeader: false)); pipeline.Add(new CategoricalHashOneHotVectorizer("F1") { HashBits = 10, Seed = 314489979, OutputKind = CategoricalTransformOutputKind.Bag }); var model = pipeline.Train(); var predictionModel = model.Predict(new InputData() { F1 = "5" }); @@ -95,7 +96,7 @@ public class Data public class Prediction { [ColumnName("PredictedLabel")] - public bool PredictedLabel; + public DvBool PredictedLabel; } [Fact] diff --git a/test/Microsoft.ML.Tests/Scenarios/HousePriceTrainAndPredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/HousePriceTrainAndPredictionTests.cs index 38ec6ce073..31fc4fdd6d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/HousePriceTrainAndPredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/HousePriceTrainAndPredictionTests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.ML.Data; using Microsoft.ML.Models; using Microsoft.ML.Runtime.Api; using Microsoft.ML.TestFramework; @@ -21,7 +22,7 @@ public void TrainAndPredictHousePriceModelTest() var pipeline = new LearningPipeline(); - pipeline.Add(new TextLoader(dataPath, useHeader: true, separator: ",")); + pipeline.Add(new TextLoader(dataPath).CreateFrom(useHeader: true, separator: ',')); pipeline.Add(new ColumnConcatenator(outputColumn: "NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15")); @@ -61,7 +62,7 @@ public void TrainAndPredictHousePriceModelTest() Assert.InRange(prediction.Price, 260_000, 330_000); string testDataPath = GetDataPath("kc_house_test.csv"); - var testData = new TextLoader(testDataPath, useHeader: true, separator: ","); + var testData = new TextLoader(testDataPath).CreateFrom(useHeader: true, separator: ','); var evaluator = new RegressionEvaluator(); RegressionMetrics metrics = evaluator.Evaluate(model, testData); diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs index de7c602047..5dcbf3a588 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
+using Microsoft.ML.Data; using Microsoft.ML.Models; using Microsoft.ML.Runtime.Api; using Microsoft.ML.Trainers; @@ -19,7 +20,7 @@ public void TrainAndPredictIrisModelTest() var pipeline = new LearningPipeline(); - pipeline.Add(new TextLoader(dataPath, useHeader: false, separator: "tab")); + pipeline.Add(new TextLoader(dataPath).CreateFrom(useHeader: false)); pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")); @@ -66,7 +67,7 @@ public void TrainAndPredictIrisModelTest() // Note: Testing against the same data set as a simple way to test evaluation. // This isn't appropriate in real-world scenarios. string testDataPath = GetDataPath("iris.txt"); - var testData = new TextLoader(testDataPath, useHeader: false, separator: "tab"); + var testData = new TextLoader(testDataPath).CreateFrom(useHeader: false); var evaluator = new ClassificationEvaluator(); evaluator.OutputTopKAcc = 3; diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs index 79cc2fc137..ebddc33b03 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.ML.Data; using Microsoft.ML.Models; using Microsoft.ML.Runtime.Api; using Microsoft.ML.Trainers; @@ -19,7 +20,7 @@ public void TrainAndPredictIrisModelWithStringLabelTest() var pipeline = new LearningPipeline(); - pipeline.Add(new TextLoader(dataPath, useHeader: false, separator: ",")); + pipeline.Add(new TextLoader(dataPath).CreateFrom(useHeader: false, separator: ',')); pipeline.Add(new Dictionarizer("Label")); // "IrisPlantType" is used as "Label" because of column attribute name on the field. @@ -69,7 +70,7 @@ public void TrainAndPredictIrisModelWithStringLabelTest() // Note: Testing against the same data set as a simple way to test evaluation. // This isn't appropriate in real-world scenarios. string testDataPath = GetDataPath("iris.data"); - var testData = new TextLoader(testDataPath, useHeader: false, separator: ","); + var testData = new TextLoader(testDataPath).CreateFrom(useHeader: false, separator: ','); var evaluator = new ClassificationEvaluator(); evaluator.OutputTopKAcc = 3; diff --git a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs index 608cbef144..80947644e9 100644 --- a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs @@ -2,9 +2,11 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
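The loader migration in the scenario tests above follows one mechanical recipe, sketched here. Tab is evidently the CreateFrom default separator, which is why the iris calls drop separator: "tab" entirely, and comma now passes as a char rather than a string:

```csharp
// old: pipeline.Add(new TextLoader(dataPath, useHeader: false, separator: "tab"));
pipeline.Add(new TextLoader(dataPath).CreateFrom(useHeader: false));

// old: new TextLoader(testDataPath, useHeader: true, separator: ",");
var testData = new TextLoader(testDataPath).CreateFrom(useHeader: true, separator: ',');
```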
+using Microsoft.ML.Data; using Microsoft.ML.Models; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Runtime.Data; using Microsoft.ML.Trainers; using Microsoft.ML.Transforms; using System.Collections.Generic; @@ -23,7 +25,32 @@ public void TrainAndPredictSentimentModelTest() { string dataPath = GetDataPath(SentimentDataPath); var pipeline = new LearningPipeline(); - pipeline.Add(new TextLoader(dataPath, useHeader: true, separator: "tab")); + + pipeline.Add(new Data.TextLoader(dataPath) + { + Arguments = new TextLoaderArguments + { + Separator = new[] { '\t' }, + HasHeader = true, + Column = new[] + { + new TextLoaderColumn() + { + Name = "Label", + Source = new [] { new TextLoaderRange(0) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SentimentText", + Source = new [] { new TextLoaderRange(1) }, + Type = Runtime.Data.DataKind.Text + } + } + } + }); + pipeline.Add(new TextFeaturizer("Features", "SentimentText") { KeepDiacritics = false, @@ -56,12 +83,34 @@ public void TrainAndPredictSentimentModelTest() IEnumerable<SentimentPrediction> predictions = model.Predict(sentiments); Assert.Equal(2, predictions.Count()); - Assert.False(predictions.ElementAt(0).Sentiment); - Assert.True(predictions.ElementAt(1).Sentiment); + Assert.True(predictions.ElementAt(0).Sentiment.IsFalse); + Assert.True(predictions.ElementAt(1).Sentiment.IsTrue); string testDataPath = GetDataPath(SentimentTestPath); - var testData = new TextLoader(testDataPath, useHeader: true, separator: "tab"); - + var testData = new Data.TextLoader(testDataPath) + { + Arguments = new TextLoaderArguments + { + Separator = new[] { '\t' }, + HasHeader = true, + Column = new[] + { + new TextLoaderColumn() + { + Name = "Label", + Source = new [] { new TextLoaderRange(0) }, + Type = Runtime.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SentimentText", + Source = new [] { new TextLoaderRange(1) }, + Type = Runtime.Data.DataKind.Text + } + } + } + }; var evaluator = new BinaryClassificationEvaluator(); BinaryClassificationMetrics metrics = evaluator.Evaluate(model, testData); @@ -105,7 +154,7 @@ public class SentimentData public class SentimentPrediction { [ColumnName("PredictedLabel")] - public bool Sentiment; + public DvBool Sentiment; } } } diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 96075b625a..40c0b6525f 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information.
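Two column-addressing styles coexist after this patch: the entry-point TextLoader.Range with explicit Min/Max (in TestEntryPoints.cs earlier) and the Microsoft.ML.Data TextLoaderRange convenience constructor used in the sentiment and house-price loaders above. Assuming the single-argument constructor is shorthand for Min == Max, the two forms below address the same slot:

```csharp
// Entry-point style: explicit slot range (here a single slot, index 1).
var range = new TextLoader.Range() { Min = 1, Max = 1 };

// Microsoft.ML.Data style: single index; assumed equivalent to Min = 1, Max = 1.
var single = new TextLoaderRange(1);
```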
using Microsoft.ML; +using Microsoft.ML.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Api; using Microsoft.ML.Runtime.Data; @@ -24,19 +25,19 @@ public TextLoaderTests(ITestOutputHelper output) [Fact] public void ConstructorDoesntThrow() { - Assert.NotNull(new TextLoader("fakeFile.txt")); - Assert.NotNull(new TextLoader("fakeFile.txt", useHeader: true)); - Assert.NotNull(new TextLoader("fakeFile.txt", separator: "tab")); - Assert.NotNull(new TextLoader("fakeFile.txt", useHeader: false, separator: "tab")); - Assert.NotNull(new TextLoader("fakeFile.txt", useHeader: false, separator: "tab", false, false)); - Assert.NotNull(new TextLoader("fakeFile.txt", useHeader: false, separator: "tab", supportSparse: false)); - Assert.NotNull(new TextLoader("fakeFile.txt", useHeader: false, separator: "tab", allowQuotedStrings: false)); + Assert.NotNull(new Data.TextLoader("fakeFile.txt").CreateFrom()); + Assert.NotNull(new Data.TextLoader("fakeFile.txt").CreateFrom(useHeader:true)); + Assert.NotNull(new Data.TextLoader("fakeFile.txt").CreateFrom()); + Assert.NotNull(new Data.TextLoader("fakeFile.txt").CreateFrom(useHeader: false)); + Assert.NotNull(new Data.TextLoader("fakeFile.txt").CreateFrom(useHeader: false, supportSparse: false, trimWhitespace: false)); + Assert.NotNull(new Data.TextLoader("fakeFile.txt").CreateFrom(useHeader: false, supportSparse: false)); + Assert.NotNull(new Data.TextLoader("fakeFile.txt").CreateFrom(useHeader: false, allowQuotedStrings: false)); } [Fact] public void CanSuccessfullyApplyATransform() { - var loader = new TextLoader("fakeFile.txt"); + var loader = new Data.TextLoader("fakeFile.txt").CreateFrom(); using (var environment = new TlcEnvironment()) { @@ -53,7 +54,7 @@ public void CanSuccessfullyApplyATransform() public void CanSuccessfullyRetrieveQuotedData() { string dataPath = GetDataPath("QuotingData.csv"); - var loader = new TextLoader(dataPath, useHeader: true, separator: ",", allowQuotedStrings: true, supportSparse: false); + var loader = new Data.TextLoader(dataPath).CreateFrom(useHeader: true, separator: ',', allowQuotedStrings: true, supportSparse: false); using (var environment = new TlcEnvironment()) { @@ -111,7 +112,7 @@ public void CanSuccessfullyRetrieveQuotedData() public void CanSuccessfullyRetrieveSparseData() { string dataPath = GetDataPath("SparseData.txt"); - var loader = new TextLoader(dataPath, useHeader: true, separator: "tab", allowQuotedStrings: false, supportSparse: true); + var loader = new Data.TextLoader(dataPath).CreateFrom(useHeader: true, allowQuotedStrings: false, supportSparse: true); using (var environment = new TlcEnvironment()) { @@ -176,7 +177,7 @@ public void CanSuccessfullyRetrieveSparseData() public void CanSuccessfullyTrimSpaces() { string dataPath = GetDataPath("TrimData.csv"); - var loader = new TextLoader(dataPath, useHeader: true, separator: ",", allowQuotedStrings: false, supportSparse: false, trimWhitespace: true); + var loader = new Data.TextLoader(dataPath).CreateFrom(useHeader: true, separator: ',', allowQuotedStrings: false, supportSparse: false, trimWhitespace: true); using (var environment = new TlcEnvironment()) { @@ -223,7 +224,7 @@ public void CanSuccessfullyTrimSpaces() [Fact] public void ThrowsExceptionWithPropertyName() { - Exception ex = Assert.Throws( () => new TextLoader("fakefile.txt") ); + Exception ex = Assert.Throws( () => new Data.TextLoader("fakefile.txt").CreateFrom() ); Assert.StartsWith("String1 is missing ColumnAttribute", ex.Message); }
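Finally, the ThrowsExceptionWithPropertyName test above implies that CreateFrom builds the loader schema by reflecting over the target POCO and requires a ColumnAttribute on every field, hence the "String1 is missing ColumnAttribute" message. A hypothetical well-formed input type under that reading (names and attribute shape assumed, not taken from the patch):

```csharp
using Microsoft.ML.Runtime.Api;

// Every field carries a ColumnAttribute naming its source column ordinal,
// so schema reflection in CreateFrom has nothing to complain about.
public class GoodInput
{
    [Column("0")]
    public string String1;

    [Column("1")]
    public float Number1;
}
```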