From e46817f01c0c2e09dd4a8c8fe40953bc9a991218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=84=E6=9C=A8?= Date: Mon, 16 Sep 2019 10:06:59 +0800 Subject: [PATCH 1/2] move design docs to a folder --- doc/{ => design}/alps_submitter.md | 0 doc/{ => design}/analyzer_design.md | 0 doc/{ => design}/ant-xgboost_design.md | 0 doc/{ => design}/auth_design.md | 2 +- doc/{ => design}/cluster_design.md | 2 +- doc/{ => design}/customized+model.md | 0 .../design_intermediate_representation.md | 0 doc/{ => design}/elasticdl_on_sqlflow.md | 0 doc/{ => design}/feature_derivation.md | 0 doc/{ => design}/pipe.md | 0 doc/{ => design}/sql_parser.md | 0 .../support_multiple_sql_statements.md | 0 doc/{ => design}/training_and_validation.md | 4 +- doc/{ => design}/xgboost_on_sqlflow_design.md | 0 doc/submitter.md | 92 ---------- doc/syntax.md | 160 ------------------ 16 files changed, 4 insertions(+), 256 deletions(-) rename doc/{ => design}/alps_submitter.md (100%) rename doc/{ => design}/analyzer_design.md (100%) rename doc/{ => design}/ant-xgboost_design.md (100%) rename doc/{ => design}/auth_design.md (99%) rename doc/{ => design}/cluster_design.md (98%) rename doc/{ => design}/customized+model.md (100%) rename doc/{ => design}/design_intermediate_representation.md (100%) rename doc/{ => design}/elasticdl_on_sqlflow.md (100%) rename doc/{ => design}/feature_derivation.md (100%) rename doc/{ => design}/pipe.md (100%) rename doc/{ => design}/sql_parser.md (100%) rename doc/{ => design}/support_multiple_sql_statements.md (100%) rename doc/{ => design}/training_and_validation.md (97%) rename doc/{ => design}/xgboost_on_sqlflow_design.md (100%) delete mode 100644 doc/submitter.md delete mode 100644 doc/syntax.md diff --git a/doc/alps_submitter.md b/doc/design/alps_submitter.md similarity index 100% rename from doc/alps_submitter.md rename to doc/design/alps_submitter.md diff --git a/doc/analyzer_design.md b/doc/design/analyzer_design.md similarity index 100% rename from 
doc/analyzer_design.md rename to doc/design/analyzer_design.md diff --git a/doc/ant-xgboost_design.md b/doc/design/ant-xgboost_design.md similarity index 100% rename from doc/ant-xgboost_design.md rename to doc/design/ant-xgboost_design.md diff --git a/doc/auth_design.md b/doc/design/auth_design.md similarity index 99% rename from doc/auth_design.md rename to doc/design/auth_design.md index 93f0368801..80077541ca 100644 --- a/doc/auth_design.md +++ b/doc/design/auth_design.md @@ -72,7 +72,7 @@ that case, we store session data into a reliable storage service like The below figure demonstrates overall workflow for authorization and authentication. - + Users can access the JupyterHub web page using their own username and password. The user's identity will be verified by the [SSO](https://en.wikipedia.org/wiki/Single_sign-on) diff --git a/doc/cluster_design.md b/doc/design/cluster_design.md similarity index 98% rename from doc/cluster_design.md rename to doc/design/cluster_design.md index 4f7a5e83ee..62ccac63a1 100644 --- a/doc/cluster_design.md +++ b/doc/design/cluster_design.md @@ -9,7 +9,7 @@ This design document introduced how to support the `Cluster Model` in SQLFLow. The figure below demonstrates the overall workflow for cluster model training, which include both the pre_train autoencoder model and the clustering model.(Reference https://www.dlology.com/blog/how-to-do-unsupervised-clustering-with-keras/) -
+
1. The first part is used to load a pre_trained model. We use the output of the trained encoder layer as the input to the clustering model. 2. Then, the clustering model starts training with randomly initialized weights, and generate clusters after multiple iterations. diff --git a/doc/customized+model.md b/doc/design/customized+model.md similarity index 100% rename from doc/customized+model.md rename to doc/design/customized+model.md diff --git a/doc/design_intermediate_representation.md b/doc/design/design_intermediate_representation.md similarity index 100% rename from doc/design_intermediate_representation.md rename to doc/design/design_intermediate_representation.md diff --git a/doc/elasticdl_on_sqlflow.md b/doc/design/elasticdl_on_sqlflow.md similarity index 100% rename from doc/elasticdl_on_sqlflow.md rename to doc/design/elasticdl_on_sqlflow.md diff --git a/doc/feature_derivation.md b/doc/design/feature_derivation.md similarity index 100% rename from doc/feature_derivation.md rename to doc/design/feature_derivation.md diff --git a/doc/pipe.md b/doc/design/pipe.md similarity index 100% rename from doc/pipe.md rename to doc/design/pipe.md diff --git a/doc/sql_parser.md b/doc/design/sql_parser.md similarity index 100% rename from doc/sql_parser.md rename to doc/design/sql_parser.md diff --git a/doc/support_multiple_sql_statements.md b/doc/design/support_multiple_sql_statements.md similarity index 100% rename from doc/support_multiple_sql_statements.md rename to doc/design/support_multiple_sql_statements.md diff --git a/doc/training_and_validation.md b/doc/design/training_and_validation.md similarity index 97% rename from doc/training_and_validation.md rename to doc/design/training_and_validation.md index 7622769865..a9740bd5bf 100644 --- a/doc/training_and_validation.md +++ b/doc/design/training_and_validation.md @@ -5,7 +5,7 @@ A common ML training job usually involves two kinds of data sets: training data ## Overall SQLFlow generates a temporary table 
following the user-specific table, trains and evaluates a model. - + Notice, we talk about the **train** process in this post. @@ -125,4 +125,4 @@ In the end, SQLFlow remove the temporary table to release resources. - If the column sqlflow_random already exists, SQLFlow chooses to quit Notice, *column name started with an underscore is invalid in the hive* -- Any discussion to implement a better splitting is welcomed \ No newline at end of file +- Any discussion to implement a better splitting is welcomed diff --git a/doc/xgboost_on_sqlflow_design.md b/doc/design/xgboost_on_sqlflow_design.md similarity index 100% rename from doc/xgboost_on_sqlflow_design.md rename to doc/design/xgboost_on_sqlflow_design.md diff --git a/doc/submitter.md b/doc/submitter.md deleted file mode 100644 index 5949c97f78..0000000000 --- a/doc/submitter.md +++ /dev/null @@ -1,92 +0,0 @@ -# Submitter - -A submitter is a pluggable module in SQLFlow that is used to submit an ML job to a third party computation service. - -## Workflow - -When a user types in an extended SQL statement, SQLFlow first parses and semantically verifies the statement. Then SQLFlow either runs the ML job locally or submits the ML job to a third party computation service. - -![](figures/sqlflow-arch2.png) - -In the latter case, SQLFlow produces a job description (`TrainDescription` or `PredictDescription`) and hands it over to the submitter. For a training SQL, SQLFlow produces `TrainDescription`; for prediction SQL, SQLFlow produces `PredDescription`. The concrete definition of the description looks like the following - -```go -type ColumnType struct { - Name string // e.g. sepal_length - DatabaseTypeName string // e.g. 
FLOAT -} - -// SELECT * -// FROM iris.train -// TRAIN DNNClassifier -// WITH -// n_classes = 3, -// hidden_units = [10, 20] -// COLUMN sepal_length, sepal_width, petal_length, petal_width -// LABEL class -// INTO sqlflow_models.my_dnn_model; -type TrainDescription struct { - StandardSelect string // e.g. SELECT * FROM iris.train - Estimator string // e.g. DNNClassifier - Attrs map[string]string // e.g. "n_classes": "3", "hidden_units": "[10, 20]" - X []ColumnType // e.g. "sepal_length": "FLOAT", ... - Y ColumnType // e.g. "class": "INT" - ModelName string // e.g. my_dnn_model -} - -// SELECT * -// FROM iris.test -// PREDICT iris.predict.class -// USING sqlflow_models.my_dnn_model; -type PredDescription struct { - StandardSelect string // e.g. SELECT * FROM iris.test - TableName string // e.g. iris.predict - ModelName string // e.g. my_dnn_model -} -``` - -## Submitter Interface - -The submitter interface should provide two functions `Train` and `Predict`. The detailed definition can be the following - -```go -type Submitter interface { - // Train executes a ML training job and streams job's response through writer. - // A typical Train function should include - // - Loading the training data - // - Initializing the model - // - model.train - // - Saving the trained model to a persistent storage - Train(desc TrainDescription, writer PipeWriter) error - // Predict executes a ML predicting job and streams job's response through writer - // A typical Predict function should include - // - Loading the model from a persistent storage - // - Loading the prediction data - // - model.predict - // - Writing the prediction result to a table - Predict(desc PredictDescription, writer PipeWriter) error -} -``` - -## Register a submitter - -A new submitter can be added as - -```go -import ( - ".../my_submitter" - ".../sqlflow/sql" -) - -func main() { - // ... - sql.Register(my_submitter.NewSubmitter()) - // ... 
- for { - sql := recv() - sql.Run(sql) - } -} -``` - -where `sql.Register` will put `my_submitter` instance to package level registry. During `sql.Run`, it will check whether there is a submitter registered. If there is, `sql.Run` will run either `submitter.Train` or `submitter.Predict`. diff --git a/doc/syntax.md b/doc/syntax.md deleted file mode 100644 index 74a7c255a9..0000000000 --- a/doc/syntax.md +++ /dev/null @@ -1,160 +0,0 @@ -# SQLFlow: Design Doc - -## What is SQLFlow - -SQLFlow is a bridge that connects a SQL engine, for example, MySQL, Hive, SparkSQL, Oracle, or SQL Server, with machine learning toolkits like TensorFlow. SQLFlow extends the SQL syntax to enable model training and inference. - -## Related Work - -We could write simple machine learning prediction (or scoring) algorithms in SQL using operators like [`DOT_PRODUCT`](https://thenewstack.io/sql-fans-can-now-develop-ml-applications/). However, this requires copy-n-pasting model parameters from the training program into SQL statements. - -Some proprietary SQL engines provide extensions to support machine learning. 
- -### Microsoft SQL Server - -Microsoft SQL Server has the [machine learning service](https://docs.microsoft.com/en-us/sql/advanced-analytics/tutorials/rtsql-create-a-predictive-model-r?view=sql-server-2017) that runs machine learning programs in R or Python as an external script: - -```sql -CREATE PROCEDURE generate_linear_model -AS -BEGIN - EXEC sp_execute_external_script - @language = N'R' - , @script = N'lrmodel <- rxLinMod(formula = distance ~ speed, data = CarsData); - trained_model <- data.frame(payload = as.raw(serialize(lrmodel, connection=NULL)));' - , @input_data_1 = N'SELECT [speed], [distance] FROM CarSpeed' - , @input_data_1_name = N'CarsData' - , @output_data_1_name = N'trained_model' - WITH RESULT SETS ((model varbinary(max))); -END; -``` - -A challenge to the users is that they need to know not only SQL but also R or Python, and they must be capable of writing machine learning programs in R or Python. - -### Teradata SQL for DL - -Teradata also provides a [RESTful service](https://www.linkedin.com/pulse/sql-deep-learning-sql-dl-omri-shiv), which is callable from the extended SQL SELECT syntax. - -```sql -SELECT * FROM deep_learning_scorer( - ON (SELECT * FROM cc_data LIMIT 100) - URL('http://localhost:8000/api/v1/request') - ModelName('cc') - ModelVersion('1') - RequestType('predict') - columns('v1', 'v2', ..., 'amount') -) -``` - -The above syntax couples the deployment of the service (the URL in the above SQL statement) with the algorithm. - -### Google BigQuery - -Google [BigQuery](https://cloud.google.com/bigquery/docs/bigqueryml-intro) enables machine learning in SQL by introducing the `CREATE MODEL` statement. - -```sql -CREATE MODEL dataset.model_name - OPTIONS(model_type='linear_reg', input_label_cols=['input_label']) -AS SELECT * FROM input_table; -``` - -Currently, BigQuery only supports two simple models: linear regression and logistic regression. - -## Design Goal - -None of the above meets our requirement. 
- -First of all, we want to build an open source software. Also, we want it to be extensible: - -- We want it extensible to many SQL engines, instead of targeting any one of them. Therefore, we don't want to build our syntax extension on top of user-defined functions (UDFs); otherwise, we'd have to implement them for each SQL engine. - -- We want the system extensible to support sophisticated machine learning models and toolkits, including TensorFlow for deep learning and [xgboost](https://github.com/dmlc/xgboost) for trees. - -Another challenge is that we want SQLFlow to be flexible enough to configure and run cutting-edge algorithms, including specifying [feature crosses](https://www.tensorflow.org/api_docs/python/tf/feature_column/crossed_column). At the same time, we want SQLFlow easy to learn -- at least, no Python or R code embedded in the SQL statements, and integrate hyperparameter estimation. - -We understand that a key to address the above challenges is the syntax of the SQL extension. To craft a highly-effective and easy-to-learn syntax, we need user feedback and fast iteration. Therefore, we'd start from a prototype that supports only MySQL and TensorFlow. We plan to support more SQL engines and machine learning toolkits later. - -## Design Decisions - -As the beginning of the iteration, we propose an extension to the SQL SELECT statement. We are not going a new statement way like that BigQuery provides `CREATE MODEL`, because we want to maintain a loose couple between our system and the underlying SQL engine, and we cannot create the new data type for the SQL engine, like `CREATE MODEL` requires. - -We highly appreciate the work of [TensorFlow Estimator](https://www.tensorflow.org/guide/estimators), a high-level API for deep learning. The basic idea behind Estimator is to implement each deep learning model, and related training/testing/evaluating algorithms as a Python class derived from `tf.estimator.Estimator`. 
As we want to keep our SQL syntax simple, we would make the system extensible by calling estimators contributed by machine learning experts and written in Python. - -The SQL syntax must allow users to set Estimator attributes (parameters of the Python class' constructor, and those of `train`, `evaluate`, or `predict`). Users can choose to use default values. We have a plan to integrate our hyperparameter estimation research into the system to optimize the default values. - -Though estimators derived from `tf.estimator.Estimator` run algorithms as TensorFlow graphs; SQLFlow doesn't restrict that the underlying machine learning toolkit has to be TensorFlow. Indeed, as long as an estimator provides methods of `train`, `evaluate`, and `predict`, SQLFlow doesn't care if it calls TensorFlow or xgboost. Precisely, what SQLFlow expect is an interface like the following: - -```python -class AnEstimatorClass: - __init__(self, **kwargs) - train(self, **kwargs) - evaluate(self, **kwargs) - predict(self, **kwargs) -``` - -We also want to reuse the [feature columns API](https://www.tensorflow.org/guide/feature_columns) from Estimator, which allows users to columns of tables in a SQL engine to features to the model. - - -## Extended SQL Syntax - -Again, just as the beginning of the iteration, we propose the syntax for training as - -```sql -SELECT * FROM kaggle_credit_fraud_training_data -LIMIT 1000 -TRAIN DNNClassifier /* a pre-defined TensorFlow estimator, tf.estimator.DNNClassifier */ -WITH layers=[100, 200], /* a parameter of the Estimator class constructor */ - train.batch_size = 8 /* a parameter of the Estimator.train method */ -COLUMN *, /* all columns as raw features */ - cross(v1, v9, v28) /* plus a derived (crossed) column */ -LABEL class -INTO sqlflow_models.my_model_table; /* saves trained model parameters and features into a table */ -``` - -We see the redundancy of `*` in two clauses: `SELECT` and `COLUMN`. 
The following alternative can avoid the redundancy, but cannot specify the label. - -```sql -SELECT * /* raw features or the label? */ - cross(v1, v9, v28) /* derived featuers */ -FROM kaggle_credit_fraud_training_data -``` - -Please be aware that we save the trained models into tables, instead of a variable maintained by the underlying SQL engine. To invent a new variable type to hold trained models, we'd make our system tightly integrated with the SQL engine, and harms the extensibility to other engines. - -The result table should include the following information: - -1. The estimator name, e.g., `DNNClassifier` in this case. -1. Estimator attributes, e.g., `layer` and `train.batch_size`. -1. The feature mapping, e.g., `*` and `cross(v1, v9, v28)`. - -Similarly, to infer the class (fraud or regular), we could - -```sql -SELECT * FROM kaggle_credit_fraud_development_data -PREDICT kaggle_credit_fraud_development_data.class -USING sqlflow_models.my_model_table; -``` - -## System Architecture - -### A Conceptual Overview - -In the prototype, we use the following architecture: - -``` -SQL statement -> our SQL parser --standard SQL-> MySQL - \-extended SQL-> code generator -> execution engine -``` - -In the prototype, the code generator generates a Python program that trains or predicts. In either case, - -1. it retrieves the data from MySQL via [MySQL Connector Python API](https://dev.mysql.com/downloads/connector/python/), -1. optionally, retrieves the model from MySQL, -1. trains the model or predicts using the trained model by calling the user specified TensorFlow estimator, -1. and writes the trained model or prediction results into a table. - -### Working with Jupyter Notebook and Kubernetes - -The following figure shows the system components and their runtime environment. The left part shows how to run the system on a PC/laptop, and the right part shows how to run it on a Kubernetes cluster. 
- -![](figures/sqlflow-arch.png) From bd033343d9a874027373629a5a1cdbc8a07f4b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=84=E6=9C=A8?= Date: Mon, 16 Sep 2019 12:03:12 +0800 Subject: [PATCH 2/2] update --- README.md | 4 +- ..._submitter.md => design_alps_submitter.md} | 2 +- ...{analyzer_design.md => design_analyzer.md} | 2 +- ...gboost_design.md => design_ant_xgboost.md} | 0 doc/design/{auth_design.md => design_auth.md} | 2 +- ...uster_design.md => design_clustermodel.md} | 2 +- ...ed+model.md => design_customized_model.md} | 2 +- .../design_database_abstraction_layer.md} | 4 +- ...flow.md => design_elasticdl_on_sqlflow.md} | 0 ...vation.md => design_feature_derivation.md} | 2 +- .../design_intermediate_representation.md | 6 +- doc/design/{pipe.md => design_pipe.md} | 4 +- .../{sql_parser.md => design_sql_parser.md} | 2 +- doc/design/design_submitter.md | 92 ++++++++++ ...design_support_multiple_sql_statements.md} | 0 doc/design/design_syntax.md | 160 ++++++++++++++++++ ...n.md => design_training_and_validation.md} | 2 +- ...design.md => design_xgboost_on_sqlflow.md} | 2 +- doc/text_classification_demo.md | 2 +- 19 files changed, 271 insertions(+), 19 deletions(-) rename doc/design/{alps_submitter.md => design_alps_submitter.md} (99%) rename doc/design/{analyzer_design.md => design_analyzer.md} (97%) rename doc/design/{ant-xgboost_design.md => design_ant_xgboost.md} (100%) rename doc/design/{auth_design.md => design_auth.md} (99%) rename doc/design/{cluster_design.md => design_clustermodel.md} (99%) rename doc/design/{customized+model.md => design_customized_model.md} (99%) rename doc/{database_abstraction_layer.md => design/design_database_abstraction_layer.md} (89%) rename doc/design/{elasticdl_on_sqlflow.md => design_elasticdl_on_sqlflow.md} (100%) rename doc/design/{feature_derivation.md => design_feature_derivation.md} (99%) rename doc/design/{pipe.md => design_pipe.md} (90%) rename doc/design/{sql_parser.md => design_sql_parser.md} (98%) create mode 
100644 doc/design/design_submitter.md rename doc/design/{support_multiple_sql_statements.md => design_support_multiple_sql_statements.md} (100%) create mode 100644 doc/design/design_syntax.md rename doc/design/{training_and_validation.md => design_training_and_validation.md} (99%) rename doc/design/{xgboost_on_sqlflow_design.md => design_xgboost_on_sqlflow.md} (98%) diff --git a/README.md b/README.md index e91d458b71..de04d73153 100644 --- a/README.md +++ b/README.md @@ -52,13 +52,13 @@ Done predicting. Predict table : iris.predict - [Installation](doc/installation.md) - [Running a Demo](doc/demo.md) -- [Extended SQL Syntax](doc/syntax.md) +- [User Guide](doc/user_guide.md) ## Contributions - [Build from source](doc/build.md) - [The walkthrough of the source code](doc/walkthrough.md) -- [The choice of parser generator](doc/sql_parser.md) +- [The choice of parser generator](doc/design/design_sql_parser.md) ## Roadmap diff --git a/doc/design/alps_submitter.md b/doc/design/design_alps_submitter.md similarity index 99% rename from doc/design/alps_submitter.md rename to doc/design/design_alps_submitter.md index 44ba70fdce..d2190290d6 100644 --- a/doc/design/alps_submitter.md +++ b/doc/design/design_alps_submitter.md @@ -1,4 +1,4 @@ -# Proof of Concept: ALPS Submitter +# _Design:_ ALPS Submitter ALPS (Ant Learning and Prediction Suite) provides a common algorithm-driven framework in Ant Financial, focusing on providing users with an efficient and easy-to-use machine learning programming framework and a financial learning machine learning algorithm solution. 
diff --git a/doc/design/analyzer_design.md b/doc/design/design_analyzer.md similarity index 97% rename from doc/design/analyzer_design.md rename to doc/design/design_analyzer.md index ae17cd1460..4fa68aa06c 100644 --- a/doc/design/analyzer_design.md +++ b/doc/design/design_analyzer.md @@ -1,4 +1,4 @@ -# Design: Analyze the Machine Learning Mode in SQLFlow +# _Design:_ Analyze the Machine Learning Mode in SQLFlow ## Concept diff --git a/doc/design/ant-xgboost_design.md b/doc/design/design_ant_xgboost.md similarity index 100% rename from doc/design/ant-xgboost_design.md rename to doc/design/design_ant_xgboost.md diff --git a/doc/design/auth_design.md b/doc/design/design_auth.md similarity index 99% rename from doc/design/auth_design.md rename to doc/design/design_auth.md index 80077541ca..ef66c00e33 100644 --- a/doc/design/auth_design.md +++ b/doc/design/design_auth.md @@ -1,4 +1,4 @@ -# Design: SQLFlow Authentication and Authorization +# _Design:_ SQLFlow Authentication and Authorization ## Concepts diff --git a/doc/design/cluster_design.md b/doc/design/design_clustermodel.md similarity index 99% rename from doc/design/cluster_design.md rename to doc/design/design_clustermodel.md index 62ccac63a1..f1a41d4019 100644 --- a/doc/design/cluster_design.md +++ b/doc/design/design_clustermodel.md @@ -1,4 +1,4 @@ -# Design: Clustering in SQLflow to analyze patterns in data +# _Design:_ Clustering in SQLflow to analyze patterns in data ## ClusterModel introduction diff --git a/doc/design/customized+model.md b/doc/design/design_customized_model.md similarity index 99% rename from doc/design/customized+model.md rename to doc/design/design_customized_model.md index 7f9d5a31dd..3b917afe63 100644 --- a/doc/design/customized+model.md +++ b/doc/design/design_customized_model.md @@ -1,4 +1,4 @@ -# Design Doc: Define Models for SQLFlow +# _Design:_ Define Models for SQLFlow SQLFlow enables SQL programs to call deep learning models defined in Python. 
This document is about how to define models for SQLFlow. diff --git a/doc/database_abstraction_layer.md b/doc/design/design_database_abstraction_layer.md similarity index 89% rename from doc/database_abstraction_layer.md rename to doc/design/design_database_abstraction_layer.md index 96a4d42963..8606bd0de5 100644 --- a/doc/database_abstraction_layer.md +++ b/doc/design/design_database_abstraction_layer.md @@ -1,4 +1,4 @@ -# Compatibility with Various SQL Engines +# _Design:_ Compatibility with Various SQL Engines SQLFlow interacts with SQL engines like MySQL and Hive, while different SQL engines use variants of SQL syntax, it is important for SQLFlow to have an abstraction layer that hides such differences. @@ -8,7 +8,7 @@ SQLFlow calls Go's [standard database API](https://golang.org/pkg/database/sql/) ### Data Retrieval -The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TRAIN and PREDICT clauses. For more discussion, please refer to the [syntax design](/doc/syntax.md). SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TRAIN or PREDICT, which we call the "standard part", to the SQL engine. SQLFlow also accepts the SELECT statement without TRAIN or PREDICT clauses and would forward such "standard statements" to the engine. It is noticeable that the "standard part" or "standard statements" are not standardized. For example, various engines use different syntax for `FULL OUTER JOIN`. +The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TRAIN and PREDICT clauses. For more discussion, please refer to the [syntax design](/doc/design/design_syntax.md). SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TRAIN or PREDICT, which we call the "standard part", to the SQL engine. 
SQLFlow also accepts the SELECT statement without TRAIN or PREDICT clauses and would forward such "standard statements" to the engine. It is noticeable that the "standard part" or "standard statements" are not standardized. For example, various engines use different syntax for `FULL OUTER JOIN`. - Hive supports `FULL OUTER JOIN` directly. - MySQL doesn't have `FULL OUTER JOIN`. However, a user can emulates `FULL OUTER JOIN` using `LEFT JOIN`, `UNION` and `RIGHT JOIN`. diff --git a/doc/design/elasticdl_on_sqlflow.md b/doc/design/design_elasticdl_on_sqlflow.md similarity index 100% rename from doc/design/elasticdl_on_sqlflow.md rename to doc/design/design_elasticdl_on_sqlflow.md diff --git a/doc/design/feature_derivation.md b/doc/design/design_feature_derivation.md similarity index 99% rename from doc/design/feature_derivation.md rename to doc/design/design_feature_derivation.md index 725408c1a9..08e38a5a39 100644 --- a/doc/design/feature_derivation.md +++ b/doc/design/design_feature_derivation.md @@ -1,4 +1,4 @@ -# Design: Feature Derivation +# _Design:_ Feature Derivation This file discusses the details and implementations of "Feature Derivation". Please refer to [this](https://medium.com/@SQLFlow/feature-derivation-the-conversion-from-sql-data-to-tensors-833519db1467) blog to diff --git a/doc/design/design_intermediate_representation.md b/doc/design/design_intermediate_representation.md index 1398947e48..4aae550d48 100644 --- a/doc/design/design_intermediate_representation.md +++ b/doc/design/design_intermediate_representation.md @@ -6,10 +6,10 @@ As SQLFlow is supporting more and more machine learning toolkits, the correspond The core `sql` package should include the following functionalities: 1. The entry point of running extended SQL statements. -1. The [parsing](https://github.com/sql-machine-learning/sqlflow/blob/develop/doc/sql_parser.md) of extended SQL statements. +1. 
The [parsing](https://github.com/sql-machine-learning/sqlflow/blob/develop/doc/design/design_sql_parser.md) of extended SQL statements. 1. The verification of extended SQL statements, including verifying the syntax, the existence of the selected fields. -1. The [feature derivation](https://github.com/sql-machine-learning/sqlflow/blob/develop/doc/feature_derivation.md), including name, type, shape, and preprocessing method of the select fields. -1. The [training data and validation data split](https://github.com/sql-machine-learning/sqlflow/blob/develop/doc/training_and_validation.md). +1. The [feature derivation](https://github.com/sql-machine-learning/sqlflow/blob/develop/doc/design/design_feature_derivation.md), including name, type, shape, and preprocessing method of the select fields. +1. The [training data and validation data split](https://github.com/sql-machine-learning/sqlflow/blob/develop/doc/design/design_training_and_validation.md). With these functionalities, the `sql` package çan translate user typed extended SQL statements to an IR as an exposed Go struct. The codegen package takes the IR and returns a generated Python program for the `sql` package to execute. diff --git a/doc/design/pipe.md b/doc/design/design_pipe.md similarity index 90% rename from doc/design/pipe.md rename to doc/design/design_pipe.md index a0384ddfe0..f5c21c8d0e 100644 --- a/doc/design/pipe.md +++ b/doc/design/design_pipe.md @@ -1,9 +1,9 @@ -# Piping Responses +# _Design:_ Piping Responses ## Streaming Responses -As described in the [overall design](doc/syntax.md), a SQLFlow job could be a standard or an extended SQL statement, where an extended SQL statement will be translated into a Python program. Therefore, each job might generate up to the following data streams: +As described in the [overall design](design_syntax.md), a SQLFlow job could be a standard or an extended SQL statement, where an extended SQL statement will be translated into a Python program. 
Therefore, each job might generate up to the following data streams: 1. standard output, where each element is a line of text, 1. standard error, where each element is a line of text, diff --git a/doc/design/sql_parser.md b/doc/design/design_sql_parser.md similarity index 98% rename from doc/design/sql_parser.md rename to doc/design/design_sql_parser.md index d45f7ee2be..d0de80f939 100644 --- a/doc/design/sql_parser.md +++ b/doc/design/design_sql_parser.md @@ -1,4 +1,4 @@ -# Extended SQL Parser Design +# _Design:_ Extended SQL Parser This documentation explains the technical decision made in building a SQL parser in Go. It is used to parsed the extended SELECT syntax of SQL that diff --git a/doc/design/design_submitter.md b/doc/design/design_submitter.md new file mode 100644 index 0000000000..55e4046d31 --- /dev/null +++ b/doc/design/design_submitter.md @@ -0,0 +1,92 @@ +# _Design:_ Submitter + +A submitter is a pluggable module in SQLFlow that is used to submit an ML job to a third party computation service. + +## Workflow + +When a user types in an extended SQL statement, SQLFlow first parses and semantically verifies the statement. Then SQLFlow either runs the ML job locally or submits the ML job to a third party computation service. + +![](../figures/sqlflow-arch2.png) + +In the latter case, SQLFlow produces a job description (`TrainDescription` or `PredDescription`) and hands it over to the submitter. For a training SQL, SQLFlow produces `TrainDescription`; for prediction SQL, SQLFlow produces `PredDescription`. The concrete definition of the description looks like the following: + +```go +type ColumnType struct { + Name string // e.g. sepal_length + DatabaseTypeName string // e.g. 
FLOAT +} + +// SELECT * +// FROM iris.train +// TRAIN DNNClassifier +// WITH +// n_classes = 3, +// hidden_units = [10, 20] +// COLUMN sepal_length, sepal_width, petal_length, petal_width +// LABEL class +// INTO sqlflow_models.my_dnn_model; +type TrainDescription struct { + StandardSelect string // e.g. SELECT * FROM iris.train + Estimator string // e.g. DNNClassifier + Attrs map[string]string // e.g. "n_classes": "3", "hidden_units": "[10, 20]" + X []ColumnType // e.g. "sepal_length": "FLOAT", ... + Y ColumnType // e.g. "class": "INT" + ModelName string // e.g. my_dnn_model +} + +// SELECT * +// FROM iris.test +// PREDICT iris.predict.class +// USING sqlflow_models.my_dnn_model; +type PredDescription struct { + StandardSelect string // e.g. SELECT * FROM iris.test + TableName string // e.g. iris.predict + ModelName string // e.g. my_dnn_model +} +``` + +## Submitter Interface + +The submitter interface should provide two functions, `Train` and `Predict`. The detailed definition can be the following: + +```go +type Submitter interface { + // Train executes an ML training job and streams job's response through writer. + // A typical Train function should include + // - Loading the training data + // - Initializing the model + // - model.train + // - Saving the trained model to a persistent storage + Train(desc TrainDescription, writer PipeWriter) error + // Predict executes an ML predicting job and streams job's response through writer. + // A typical Predict function should include + // - Loading the model from a persistent storage + // - Loading the prediction data + // - model.predict + // - Writing the prediction result to a table + Predict(desc PredDescription, writer PipeWriter) error +} +``` + +## Register a submitter + +A new submitter can be added as follows: + +```go +import ( + ".../my_submitter" + ".../sqlflow/sql" +) + +func main() { + // ... + sql.Register(my_submitter.NewSubmitter()) + // ... 
+ for { + stmt := recv() + sql.Run(stmt) + } +} +``` + +where `sql.Register` puts the `my_submitter` instance into a package-level registry. When `sql.Run` is called, it checks whether a submitter has been registered. If there is one, `sql.Run` will run either `submitter.Train` or `submitter.Predict`. diff --git a/doc/design/support_multiple_sql_statements.md b/doc/design/design_support_multiple_sql_statements.md similarity index 100% rename from doc/design/support_multiple_sql_statements.md rename to doc/design/design_support_multiple_sql_statements.md diff --git a/doc/design/design_syntax.md b/doc/design/design_syntax.md new file mode 100644 index 0000000000..d27d5d6e76 --- /dev/null +++ b/doc/design/design_syntax.md @@ -0,0 +1,160 @@ +# _Design:_ SQLFlow + +## What is SQLFlow + +SQLFlow is a bridge that connects a SQL engine, for example, MySQL, Hive, SparkSQL, Oracle, or SQL Server, with machine learning toolkits like TensorFlow. SQLFlow extends the SQL syntax to enable model training and inference. + +## Related Work + +We could write simple machine learning prediction (or scoring) algorithms in SQL using operators like [`DOT_PRODUCT`](https://thenewstack.io/sql-fans-can-now-develop-ml-applications/). However, this requires copy-n-pasting model parameters from the training program into SQL statements. + +Some proprietary SQL engines provide extensions to support machine learning. 
+ +### Microsoft SQL Server + +Microsoft SQL Server has the [machine learning service](https://docs.microsoft.com/en-us/sql/advanced-analytics/tutorials/rtsql-create-a-predictive-model-r?view=sql-server-2017) that runs machine learning programs in R or Python as an external script: + +```sql +CREATE PROCEDURE generate_linear_model +AS +BEGIN + EXEC sp_execute_external_script + @language = N'R' + , @script = N'lrmodel <- rxLinMod(formula = distance ~ speed, data = CarsData); + trained_model <- data.frame(payload = as.raw(serialize(lrmodel, connection=NULL)));' + , @input_data_1 = N'SELECT [speed], [distance] FROM CarSpeed' + , @input_data_1_name = N'CarsData' + , @output_data_1_name = N'trained_model' + WITH RESULT SETS ((model varbinary(max))); +END; +``` + +A challenge to the users is that they need to know not only SQL but also R or Python, and they must be capable of writing machine learning programs in R or Python. + +### Teradata SQL for DL + +Teradata also provides a [RESTful service](https://www.linkedin.com/pulse/sql-deep-learning-sql-dl-omri-shiv), which is callable from the extended SQL SELECT syntax. + +```sql +SELECT * FROM deep_learning_scorer( + ON (SELECT * FROM cc_data LIMIT 100) + URL('http://localhost:8000/api/v1/request') + ModelName('cc') + ModelVersion('1') + RequestType('predict') + columns('v1', 'v2', ..., 'amount') +) +``` + +The above syntax couples the deployment of the service (the URL in the above SQL statement) with the algorithm. + +### Google BigQuery + +Google [BigQuery](https://cloud.google.com/bigquery/docs/bigqueryml-intro) enables machine learning in SQL by introducing the `CREATE MODEL` statement. + +```sql +CREATE MODEL dataset.model_name + OPTIONS(model_type='linear_reg', input_label_cols=['input_label']) +AS SELECT * FROM input_table; +``` + +Currently, BigQuery only supports two simple models: linear regression and logistic regression. + +## Design Goal + +None of the above meets our requirement. 
+ +First of all, we want to build open source software. Also, we want it to be extensible: + +- We want it to be extensible to many SQL engines, instead of targeting any one of them. Therefore, we don't want to build our syntax extension on top of user-defined functions (UDFs); otherwise, we'd have to implement them for each SQL engine. + +- We want the system to be extensible to support sophisticated machine learning models and toolkits, including TensorFlow for deep learning and [xgboost](https://github.com/dmlc/xgboost) for trees. + +Another challenge is that we want SQLFlow to be flexible enough to configure and run cutting-edge algorithms, including specifying [feature crosses](https://www.tensorflow.org/api_docs/python/tf/feature_column/crossed_column). At the same time, we want SQLFlow to be easy to learn -- at the very least, it should require no Python or R code embedded in the SQL statements, and it should integrate hyperparameter estimation. + +We understand that a key to addressing the above challenges is the syntax of the SQL extension. To craft a highly effective and easy-to-learn syntax, we need user feedback and fast iteration. Therefore, we'd start from a prototype that supports only MySQL and TensorFlow. We plan to support more SQL engines and machine learning toolkits later. + +## Design Decisions + +As the beginning of the iteration, we propose an extension to the SQL SELECT statement. We are not introducing a new statement, as BigQuery does with `CREATE MODEL`, because we want to maintain loose coupling between our system and the underlying SQL engine, and we cannot create a new data type for the SQL engine, as `CREATE MODEL` requires. + +We highly appreciate the work of [TensorFlow Estimator](https://www.tensorflow.org/guide/estimators), a high-level API for deep learning. The basic idea behind Estimator is to implement each deep learning model, together with its related training/testing/evaluating algorithms, as a Python class derived from `tf.estimator.Estimator`. 
As we want to keep our SQL syntax simple, we would make the system extensible by calling estimators contributed by machine learning experts and written in Python. + +The SQL syntax must allow users to set Estimator attributes (parameters of the Python class' constructor, and those of `train`, `evaluate`, or `predict`). Users can choose to use default values. We have a plan to integrate our hyperparameter estimation research into the system to optimize the default values. + +Though estimators derived from `tf.estimator.Estimator` run algorithms as TensorFlow graphs, SQLFlow doesn't require the underlying machine learning toolkit to be TensorFlow. Indeed, as long as an estimator provides the methods `train`, `evaluate`, and `predict`, SQLFlow doesn't care if it calls TensorFlow or xgboost. Precisely, what SQLFlow expects is an interface like the following: + +```python +class AnEstimatorClass: + __init__(self, **kwargs) + train(self, **kwargs) + evaluate(self, **kwargs) + predict(self, **kwargs) +``` + +We also want to reuse the [feature columns API](https://www.tensorflow.org/guide/feature_columns) from Estimator, which allows users to map columns of tables in a SQL engine to features of the model. + + +## Extended SQL Syntax + +Again, just as the beginning of the iteration, we propose the syntax for training as + +```sql +SELECT * FROM kaggle_credit_fraud_training_data +LIMIT 1000 +TRAIN DNNClassifier /* a pre-defined TensorFlow estimator, tf.estimator.DNNClassifier */ +WITH layers=[100, 200], /* a parameter of the Estimator class constructor */ + train.batch_size = 8 /* a parameter of the Estimator.train method */ +COLUMN *, /* all columns as raw features */ + cross(v1, v9, v28) /* plus a derived (crossed) column */ +LABEL class +INTO sqlflow_models.my_model_table; /* saves trained model parameters and features into a table */ +``` + +We see the redundancy of `*` in two clauses: `SELECT` and `COLUMN`. 
The following alternative can avoid the redundancy, but cannot specify the label. + +```sql +SELECT * /* raw features or the label? */ + cross(v1, v9, v28) /* derived features */ +FROM kaggle_credit_fraud_training_data +``` + +Please be aware that we save the trained models into tables, instead of into a variable maintained by the underlying SQL engine. Inventing a new variable type to hold trained models would tightly couple our system to the SQL engine and harm its extensibility to other engines. + +The result table should include the following information: + +1. The estimator name, e.g., `DNNClassifier` in this case. +1. Estimator attributes, e.g., `layers` and `train.batch_size`. +1. The feature mapping, e.g., `*` and `cross(v1, v9, v28)`. + +Similarly, to infer the class (fraud or regular), we could write + +```sql +SELECT * FROM kaggle_credit_fraud_development_data +PREDICT kaggle_credit_fraud_development_data.class +USING sqlflow_models.my_model_table; +``` + +## System Architecture + +### A Conceptual Overview + +In the prototype, we use the following architecture: + +``` +SQL statement -> our SQL parser --standard SQL-> MySQL + \-extended SQL-> code generator -> execution engine +``` + +In the prototype, the code generator generates a Python program that trains or predicts. In either case, + +1. it retrieves the data from MySQL via [MySQL Connector Python API](https://dev.mysql.com/downloads/connector/python/), +1. optionally, retrieves the model from MySQL, +1. trains the model or predicts using the trained model by calling the user-specified TensorFlow estimator, +1. and writes the trained model or prediction results into a table. + +### Working with Jupyter Notebook and Kubernetes + +The following figure shows the system components and their runtime environment. The left part shows how to run the system on a PC/laptop, and the right part shows how to run it on a Kubernetes cluster. 
+ +![](../figures/sqlflow-arch.png) diff --git a/doc/design/training_and_validation.md b/doc/design/design_training_and_validation.md similarity index 99% rename from doc/design/training_and_validation.md rename to doc/design/design_training_and_validation.md index a9740bd5bf..102c26742d 100644 --- a/doc/design/training_and_validation.md +++ b/doc/design/design_training_and_validation.md @@ -1,4 +1,4 @@ -# Design: Training and Validation +# _Design:_ Training and Validation A common ML training job usually involves two kinds of data sets: training data and validation data. These two data sets will be generated automatically by SQLFlow through randomly splitting the select results. diff --git a/doc/design/xgboost_on_sqlflow_design.md b/doc/design/design_xgboost_on_sqlflow.md similarity index 98% rename from doc/design/xgboost_on_sqlflow_design.md rename to doc/design/design_xgboost_on_sqlflow.md index 9f519746b5..2132906f8a 100644 --- a/doc/design/xgboost_on_sqlflow_design.md +++ b/doc/design/design_xgboost_on_sqlflow.md @@ -1,4 +1,4 @@ -# Design Doc: XGBoost on SQLFlow +# _Design:_ XGBoost on SQLFlow ## Introduction diff --git a/doc/text_classification_demo.md b/doc/text_classification_demo.md index 9a230f255d..0fae6b487b 100644 --- a/doc/text_classification_demo.md +++ b/doc/text_classification_demo.md @@ -5,7 +5,7 @@ Note that the steps in this tutorial may be changed during the development of SQLFlow, we only provide a way that simply works for the current version. To support custom models like CNN text classification, you may check out the -current [design](https://github.com/sql-machine-learning/models/blob/develop/doc/customized%2Bmodel.md) +current [design](https://github.com/sql-machine-learning/models/blob/develop/doc/design/design_customized_model.md) for ongoing development. In this tutorial we use two datasets both for english and chinese text classification.