diff --git a/.openpublishing.redirection.json b/.openpublishing.redirection.json index bb65e9511a1..3863e26b90b 100644 --- a/.openpublishing.redirection.json +++ b/.openpublishing.redirection.json @@ -50,6 +50,78 @@ { "source_path": "docs/elasticsearch/resilience-and-recover.md", "redirect_url": "/azure/architecture" - } + }, + { + "source_path": "docs/data-guide/scenarios/batch-processing.md", + "redirect_url": "/azure/architecture/data-guide/big-data/batch-processing" + }, + { + "source_path": "docs/data-guide/concepts/big-data.md", + "redirect_url": "/azure/architecture/data-guide/big-data" + }, + { + "source_path": "docs/data-guide/concepts/machine-learning-at-scale.md", + "redirect_url": "/azure/architecture/data-guide/big-data/machine-learning-at-scale" + }, + { + "source_path": "docs/data-guide/concepts/non-relational-data.md", + "redirect_url": "/azure/architecture/data-guide/big-data/non-relational-data" + }, + { + "source_path": "docs/data-guide/scenarios/real-time-processing.md", + "redirect_url": "/azure/architecture/data-guide/big-data/real-time-processing" + }, + { + "source_path": "docs/data-guide/technology-choices/data-warehouses.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/data-warehousing" + }, + { + "source_path": "docs/data-guide/scenarios/etl.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/etl" + }, + { + "source_path": "docs/data-guide/concepts/relational-data.md", + "redirect_url": "/azure/architecture/data-guide/relational-data" + }, + { + "source_path": "docs/data-guide/concepts/advanced-analytics.md", + "redirect_url": "/azure/architecture/data-guide/scenarios/advanced-analytics" + }, + { + "source_path": "docs/data-guide/concepts/csv-and-json.md", + "redirect_url": "/azure/architecture/data-guide/scenarios/csv-and-json" + }, + { + "source_path": "docs/data-guide/concepts/data-lake.md", + "redirect_url": "/azure/architecture/data-guide/scenarios/data-lake" + }, + { + "source_path": 
"docs/data-guide/concepts/semantic-modeling.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/online-analytical-processing" + }, + { + "source_path": "docs/data-guide/concepts/transactional-data.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/online-transaction-processing" + }, + { + "source_path": "docs/data-guide/scenarios/data-warehousing.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/data-warehousing" + }, + { + "source_path": "docs/data-guide/scenarios/online-analytical-processing.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/online-analytical-processing" + }, + { + "source_path": "docs/data-guide/scenarios/online-transaction-processing.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/online-transaction-processing" + }, + { + "source_path": "docs/data-guide/technology-choices/olap-data-stores.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/online-analytical-processing" + }, + { + "source_path": "docs/data-guide/technology-choices/oltp-data-stores.md", + "redirect_url": "/azure/architecture/data-guide/relational-data/online-transaction-processing" + } ] } \ No newline at end of file diff --git a/docs/data-guide/scenarios/batch-processing.md b/docs/data-guide/big-data/batch-processing.md similarity index 96% rename from docs/data-guide/scenarios/batch-processing.md rename to docs/data-guide/big-data/batch-processing.md index 39d0aad3420..a75331673c7 100644 --- a/docs/data-guide/scenarios/batch-processing.md +++ b/docs/data-guide/big-data/batch-processing.md @@ -15,7 +15,7 @@ For example, the logs from a web server might be copied to a folder and then pro ## When to use this solution -Batch processing is used in a variety of scenarios, from simple data transformations to a more complete ETL (extract-transform-load) pipeline. 
In a big data context, batch processing may operate over very large data sets, where the computation takes significant time. (For example, see [Lambda architecture](../concepts/big-data.md#lambda-architecture).) Batch processing typically leads to further interactive exploration, provides the modeling-ready data for machine learning, or writes the data to a data store that is optimized for analytics and visualization. +Batch processing is used in a variety of scenarios, from simple data transformations to a more complete ETL (extract-transform-load) pipeline. In a big data context, batch processing may operate over very large data sets, where the computation takes significant time. (For example, see [Lambda architecture](../big-data/index.md#lambda-architecture).) Batch processing typically leads to further interactive exploration, provides the modeling-ready data for machine learning, or writes the data to a data store that is optimized for analytics and visualization. One example of batch processing is transforming a large set of flat, semi-structured CSV or JSON files into a schematized and structured format that is ready for further querying. Typically the data is converted from the raw formats used for ingestion (such as CSV) into binary formats that are more performant for querying because they store data in a columnar format, and often provide indexes and inline statistics about the data. 
diff --git a/docs/data-guide/scenarios/images/batch-pipeline.png b/docs/data-guide/big-data/images/batch-pipeline.png similarity index 100% rename from docs/data-guide/scenarios/images/batch-pipeline.png rename to docs/data-guide/big-data/images/batch-pipeline.png diff --git a/docs/data-guide/concepts/images/big-data-pipeline.png b/docs/data-guide/big-data/images/big-data-pipeline.png similarity index 100% rename from docs/data-guide/concepts/images/big-data-pipeline.png rename to docs/data-guide/big-data/images/big-data-pipeline.png diff --git a/docs/data-guide/concepts/images/document.png b/docs/data-guide/big-data/images/document.png similarity index 100% rename from docs/data-guide/concepts/images/document.png rename to docs/data-guide/big-data/images/document.png diff --git a/docs/data-guide/concepts/images/kappa.png b/docs/data-guide/big-data/images/kappa.png similarity index 100% rename from docs/data-guide/concepts/images/kappa.png rename to docs/data-guide/big-data/images/kappa.png diff --git a/docs/data-guide/concepts/images/lambda.png b/docs/data-guide/big-data/images/lambda.png similarity index 100% rename from docs/data-guide/concepts/images/lambda.png rename to docs/data-guide/big-data/images/lambda.png diff --git a/docs/data-guide/concepts/images/machine-learning-model-training-and-deployment.png b/docs/data-guide/big-data/images/machine-learning-model-training-and-deployment.png similarity index 100% rename from docs/data-guide/concepts/images/machine-learning-model-training-and-deployment.png rename to docs/data-guide/big-data/images/machine-learning-model-training-and-deployment.png diff --git a/docs/data-guide/concepts/images/object.png b/docs/data-guide/big-data/images/object.png similarity index 100% rename from docs/data-guide/concepts/images/object.png rename to docs/data-guide/big-data/images/object.png diff --git a/docs/data-guide/scenarios/images/real-time-pipeline.png b/docs/data-guide/big-data/images/real-time-pipeline.png similarity 
index 100% rename from docs/data-guide/scenarios/images/real-time-pipeline.png rename to docs/data-guide/big-data/images/real-time-pipeline.png diff --git a/docs/data-guide/concepts/images/search.png b/docs/data-guide/big-data/images/search.png similarity index 100% rename from docs/data-guide/concepts/images/search.png rename to docs/data-guide/big-data/images/search.png diff --git a/docs/data-guide/concepts/images/time-series.png b/docs/data-guide/big-data/images/time-series.png similarity index 100% rename from docs/data-guide/concepts/images/time-series.png rename to docs/data-guide/big-data/images/time-series.png diff --git a/docs/data-guide/concepts/big-data.md b/docs/data-guide/big-data/index.md similarity index 100% rename from docs/data-guide/concepts/big-data.md rename to docs/data-guide/big-data/index.md diff --git a/docs/data-guide/concepts/machine-learning-at-scale.md b/docs/data-guide/big-data/machine-learning-at-scale.md similarity index 100% rename from docs/data-guide/concepts/machine-learning-at-scale.md rename to docs/data-guide/big-data/machine-learning-at-scale.md diff --git a/docs/data-guide/concepts/non-relational-data.md b/docs/data-guide/big-data/non-relational-data.md similarity index 97% rename from docs/data-guide/concepts/non-relational-data.md rename to docs/data-guide/big-data/non-relational-data.md index f9ed743cb11..1135d085866 100644 --- a/docs/data-guide/concepts/non-relational-data.md +++ b/docs/data-guide/big-data/non-relational-data.md @@ -9,7 +9,7 @@ ms:date: 02/12/2018 A *non-relational database* is a database that does not use the tabular schema of rows and columns found in most traditional database systems. Instead, non-relational databases use a storage model that is optimized for the specific requirements of the type of data being stored. For example, data may be stored as simple key/value pairs, as JSON documents, or as a graph consisting of edges and vertices. 
-What all of these data stores have in common is that they don't use a [relational model](./relational-data.md). Also, they tend to be more specific in the type of data they support and how data can be queried. For example, time series data stores are optimized for queries over time-based sequences of data, while graph data stores are optimized for exploring weighted relationships between entities. Neither format would generalize well to the task of managing transactional data. +What all of these data stores have in common is that they don't use a [relational model](../relational-data/index.md). Also, they tend to be more specific in the type of data they support and how data can be queried. For example, time series data stores are optimized for queries over time-based sequences of data, while graph data stores are optimized for exploring weighted relationships between entities. Neither format would generalize well to the task of managing transactional data. The term *NoSQL* refers to data stores that do not use SQL for queries, and instead use other programming languages and constructs to query the data. In practice, "NoSQL" means "non-relational database," even though many of these databases do support SQL-compatible queries. However, the underlying query execution strategy is usually very different from the way a traditional RDBMS would execute the same SQL query. diff --git a/docs/data-guide/scenarios/real-time-processing.md b/docs/data-guide/big-data/real-time-processing.md similarity index 98% rename from docs/data-guide/scenarios/real-time-processing.md rename to docs/data-guide/big-data/real-time-processing.md index 053b86870ca..39abc105c07 100644 --- a/docs/data-guide/scenarios/real-time-processing.md +++ b/docs/data-guide/big-data/real-time-processing.md @@ -45,7 +45,7 @@ For more information, see [Real-time message ingestion](../technology-choices/re ### Data storage -- **Azure Storage Blob Containers** or **Azure Data Lake Store**. 
Incoming real-time data is usually captured in a message broker (see above), but in some scenarios, it can make sense to monitor a folder for new files and process them as they are created or updated. Additionally, many real-time processing solutions combine streaming data with static reference data, which can be stored in a file store. Finally, file storage may be used as an output destination for captured real-time data for archiving, or for further batch processing in a [lambda architecture](../concepts/big-data.md#lambda-architecture). +- **Azure Storage Blob Containers** or **Azure Data Lake Store**. Incoming real-time data is usually captured in a message broker (see above), but in some scenarios, it can make sense to monitor a folder for new files and process them as they are created or updated. Additionally, many real-time processing solutions combine streaming data with static reference data, which can be stored in a file store. Finally, file storage may be used as an output destination for captured real-time data for archiving, or for further batch processing in a [lambda architecture](../big-data/index.md#lambda-architecture). For more information, see [Data storage](../technology-choices/data-storage.md). 
diff --git a/docs/data-guide/concepts/images/data-pipeline-ml.png b/docs/data-guide/concepts/images/data-pipeline-ml.png deleted file mode 100644 index 12444304d16..00000000000 Binary files a/docs/data-guide/concepts/images/data-pipeline-ml.png and /dev/null differ diff --git a/docs/data-guide/concepts/images/machine-learning-model-options.png b/docs/data-guide/concepts/images/machine-learning-model-options.png deleted file mode 100644 index f2f51b59a71..00000000000 Binary files a/docs/data-guide/concepts/images/machine-learning-model-options.png and /dev/null differ diff --git a/docs/data-guide/concepts/images/three-tier-application.png b/docs/data-guide/concepts/images/three-tier-application.png deleted file mode 100644 index 8e9dd2c2b43..00000000000 Binary files a/docs/data-guide/concepts/images/three-tier-application.png and /dev/null differ diff --git a/docs/data-guide/concepts/semantic-modeling.md b/docs/data-guide/concepts/semantic-modeling.md deleted file mode 100644 index 6ecbe94f2d7..00000000000 --- a/docs/data-guide/concepts/semantic-modeling.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -title: Semantic modeling -description: -author: zoinerTejada -ms:date: 02/12/2018 ---- - -# Semantic modeling - -A semantic data model is a conceptual model that describes the meaning of the data elements it contains. Organizations often have their own terms for things, sometimes with synonyms, or even different meanings for the same term. For example, an inventory database might track a piece of equipment with an asset ID and a serial number, but a sales database might refer to the serial number as the asset ID. There is no simple way to relate these values without a model that describes the relationship. - -Semantic modeling provides a level of abstraction over the database schema, so that users don't need to know the underlying data structures. This makes it easier for end users to query data without performing aggregates and joins over the underlying schema. 
Also, usually columns are renamed to more user-friendly names, so that the context and meaning of the data are more obvious. - -Semantic modeling is predominately used for read-heavy scenarios, such as analytics and business intelligence (OLAP), as opposed to more write-heavy transactional data processing (OLTP). This is mostly due to the nature of a typical semantic layer: - -- Aggregation behaviors are set so that reporting tools display them properly. -- Business logic and calculations are defined. -- Time-oriented calculations are included. -- Data is often integrated from multiple sources. - -Traditionally, the semantic layer is placed over a data warehouse for these reasons. - -![Example diagram of a semantic layer between a data warehouse and a reporting tool](./images/semantic-modeling.png) - -There are two primary types of semantic models: - -* **Tabular**. Uses relational modeling constructs (model, tables, columns). Internally, metadata is inherited from OLAP modeling constructs (cubes, dimensions, measures). Code and script use OLAP metadata. -* **Multidimensional**. Uses traditional OLAP modeling constructs (cubes, dimensions, measures). - -Relevant Azure service: -- [Azure Analysis Services](https://azure.microsoft.com/services/analysis-services/) - -## Example use case - -An organization has data stored in a large database. It wants to make this data available to business users and customers to create their own reports and do some analysis. One option is just to give those users direct access to the database. However, there are several drawbacks to doing this, including managing security and controlling access. Also, the design of the database, including the names of tables and columns, may be hard for a user to understand. Users would need to know which tables to query, how those tables should be joined, and other business logic that must be applied to get the correct results. 
Users would also need to know a query language like SQL even to get started. Typically this leads to multiple users reporting the same metrics but with different results. - -Another option is to encapsulate all of the information that users need into a semantic model. The semantic model can be more easily queried by users with a reporting tool of their choice. The data provided by the semantic model is pulled from a data warehouse, ensuring that all users see a single version of the truth. The semantic model also provides friendly table and column names, relationships between tables, descriptions, calculations, and row-level security. - -## Typical traits of semantic modeling - -Semantic modeling and analytical processing tends to have the following traits: - -| Requirement | Description | -| --- | --- | -| Schema | Schema on write, strongly enforced| -| Uses Transactions | No | -| Locking Strategy | None | -| Updateable | No (typically requires recomputing cube) | -| Appendable | No (typically requires recomputing cube) | -| Workload | Heavy reads, read-only | -| Indexing | Multidimensional indexing | -| Datum size | Small to medium sized | -| Model | Multidimensional | -| Data shape:| Cube or star/snowflake schema | -| Query flexibility | Highly flexible | -| Scale: | Large (10s-100s GBs) | - -## See also - -- [Data warehousing](../scenarios/data-warehousing.md) -- [Online analytical processing (OLAP)](../scenarios/online-analytical-processing.md) diff --git a/docs/data-guide/concepts/transactional-data.md b/docs/data-guide/concepts/transactional-data.md deleted file mode 100644 index 7029d52ca66..00000000000 --- a/docs/data-guide/concepts/transactional-data.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: Transactional data -description: -author: zoinerTejada -ms:date: 02/12/2018 ---- - -# Transactional data - -Transactional data is information that tracks the interactions related to an organization's activities. 
These interactions are typically business transactions, such as payments received from customers, payments made to suppliers, products moving through inventory, orders taken, or services delivered. Transactional events, which represent the transactions themselves, typically contain a time dimension, some numerical values, and references to other data. - -Transactions typically need to be *atomic* and *consistent*. Atomicity means that an entire transaction always succeeds or fails as one unit of work, and is never left in a half-completed state. If a transaction cannot be completed, the database system must roll back any steps that were already done as part of that transaction. In a traditional RDBMS, this rollback happens automatically if a transaction cannot be completed. Consistency means that transactions always leave the data in a valid state. (These are very informal descriptions of atomicity and consistency. There are more formal definitions of these properties, such as [ACID](https://en.wikipedia.org/wiki/ACID).) - -Transactional databases can support strong consistency for transactions using various locking strategies, such as pessimistic locking, to ensure that all data is strongly consistent within the context of the enterprise, for all users and processes. - -The most common deployment architecture that uses transactional data is the data store tier in a 3-tier architecture. A 3-tier architecture typically consists of a presentation tier, business logic tier, and data store tier. A related deployment architecture is the [N-tier](/azure/architecture/guide/architecture-styles/n-tier) architecture, which may have multiple middle-tiers handling business logic. 
- -![Example of a 3-tier application](./images/three-tier-application.png) - -## Typical traits of transactional data - -Transactional data tends to have the following traits: - -| Requirement | Description | -| --- | --- | -| Normalization | Highly normalized | -| Schema | Schema on write, strongly enforced| -| Consistency | Strong consistency, ACID guarantees | -| Integrity | High integrity | -| Uses transactions | Yes | -| Locking strategy | Pessimistic or optimistic| -| Updateable | Yes | -| Appendable | Yes | -| Workload | Heavy writes, moderate reads | -| Indexing | Primary and secondary indexes | -| Datum size | Small to medium sized | -| Model | Relational | -| Data shape | Tabular | -| Query flexibility | Highly flexible | -| Scale | Small (MBs) to Large (a few TBs) | - -## See Also - -[Online Transaction Processing](../scenarios/online-transaction-processing.md) diff --git a/docs/data-guide/scenarios/images/control-flow-data-flow.png b/docs/data-guide/images/control-flow-data-flow.png similarity index 100% rename from docs/data-guide/scenarios/images/control-flow-data-flow.png rename to docs/data-guide/images/control-flow-data-flow.png diff --git a/docs/data-guide/scenarios/images/data-warehousing.png b/docs/data-guide/images/data-warehousing.png similarity index 100% rename from docs/data-guide/scenarios/images/data-warehousing.png rename to docs/data-guide/images/data-warehousing.png diff --git a/docs/data-guide/scenarios/images/elt.png b/docs/data-guide/images/elt.png similarity index 100% rename from docs/data-guide/scenarios/images/elt.png rename to docs/data-guide/images/elt.png diff --git a/docs/data-guide/scenarios/images/etl.png b/docs/data-guide/images/etl.png similarity index 100% rename from docs/data-guide/scenarios/images/etl.png rename to docs/data-guide/images/etl.png diff --git a/docs/data-guide/concepts/images/example-relational.png b/docs/data-guide/images/example-relational.png similarity index 100% rename from 
docs/data-guide/concepts/images/example-relational.png rename to docs/data-guide/images/example-relational.png diff --git a/docs/data-guide/concepts/images/example-relational2.png b/docs/data-guide/images/example-relational2.png similarity index 100% rename from docs/data-guide/concepts/images/example-relational2.png rename to docs/data-guide/images/example-relational2.png diff --git a/docs/data-guide/images/guide-big-data.svg b/docs/data-guide/images/guide-big-data.svg new file mode 100644 index 00000000000..a1090814f59 --- /dev/null +++ b/docs/data-guide/images/guide-big-data.svg @@ -0,0 +1,581 @@ + + + + + + + + + + Page-1 + + + Sheet.51 + + Sheet.4 + Storage + + + + Storage + + Sheet.3 + Processing + + + + Processing + + Sheet.2 + Ingestion + + + + Ingestion + + Sheet.21 + ML + + + + ML + + Sheet.22 + Reporting + + + + Reporting + + Sheet.50 + + + + Sheet.49 + + + + Event Hubs + + Sheet.8 + + + + Sheet.9 + + + + Sheet.10 + + + + Sheet.11 + + + + Sheet.12 + + + + Sheet.13 + + + + Sheet.14 + + + + Sheet.15 + + + + + Stream Analytics + + Sheet.17 + + + + Sheet.18 + + + + Sheet.19 + + + + Sheet.20 + + + + + Power BI (nc) + + Sheet.24 + + + + Sheet.25 + + + + Sheet.26 + + + + Sheet.27 + + + + Sheet.28 + + + + + Cognitive Services (opaque) (nc) + + Sheet.30 + + + + Sheet.31 + + + + + HDInsight (opaque) + + Sheet.34 + + + + + Sheet.35 + + Sheet.36 + + + + Sheet.37 + + + + Sheet.38 + + + + + + Cosmos DB + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Storage blob (opaque) + + Sheet.41 + + + + Sheet.42 + + Sheet.43 + + + + Sheet.44 + + + + Sheet.45 + + + + Sheet.46 + + + + + + Data Lake Store (nc) + + + + + diff --git a/docs/data-guide/images/guide-rdbms.svg b/docs/data-guide/images/guide-rdbms.svg new file mode 100644 index 00000000000..5d45a743880 --- /dev/null +++ b/docs/data-guide/images/guide-rdbms.svg @@ -0,0 +1,267 @@ + + + + + + + + Page-1 + + Sheet.97 + 
+ Sheet.1 + Data Warehouse + + Data Warehouse + + Sheet.2 + Analysis + + Analysis + + Sheet.3 + + + + Sheet.4 + Reporting + + Reporting + + Sheet.5 + OLTP + + OLTP + + Data Factory (opaque) + + Sheet.15 + + + + Sheet.16 + + + + + Power BI (nc).17 + + Sheet.18 + + + + Sheet.19 + + + + Sheet.20 + + + + Sheet.21 + + + + Sheet.22 + + + + + Analysis Service + + Sheet.24 + + + + + SQL database (generic) (opaque) + + Sheet.32 + + + + Sheet.33 + + Sheet.34 + + + + Sheet.35 + + + + + + Sheet.37 + + + + Sheet.38 + + + + SQL Data Warehouse (nc) + + + + Triangle.94 + + + + Sheet.96 + ETL + + ETL + + + diff --git a/docs/data-guide/images/guide-steps.svg b/docs/data-guide/images/guide-steps.svg deleted file mode 100644 index 99333077853..00000000000 --- a/docs/data-guide/images/guide-steps.svg +++ /dev/null @@ -1,239 +0,0 @@ - - - - - - Page-1 - - Sheet.98 - - Sheet.71 - RDBMS - - RDBMS - - - Simple Arrow.83 - - - - Simple Arrow.84 - - - - Sheet.94 - Technology choices - - Technology - choices - - - Sheet.95 - Solutions - - Scenarios - - - Sheet.96 - Concepts - - Concepts - - - - Sheet.99 - - Sheet.100 - NoSQL - - NoSQL - - - Simple Arrow.83 - - - - Simple Arrow.84 - - - - Sheet.103 - Technology choices - - Technology - choices - - - Sheet.104 - Solutions - - Scenarios - - - Sheet.105 - Concepts - - Concepts - - - - Sheet.46 - - Sheet.14 - - - - Sheet.15 - - - - Sheet.16 - - - - Sheet.17 - - - - Sheet.18 - - - - Sheet.19 - - - - Sheet.20 - - - - Sheet.21 - - - - Sheet.30 - - - - Sheet.31 - - - - Sheet.32 - - - - Sheet.33 - - - - Sheet.34 - - - - Sheet.35 - - - - Sheet.36 - - - - Sheet.37 - - - - Sheet.38 - - - - Sheet.39 - - - - Sheet.40 - - - - Sheet.41 - - - - Sheet.42 - - - - Sheet.43 - - - - Sheet.44 - - - - Sheet.45 - - - - - Sheet.61 - - Sheet.47 - - - - Sheet.48 - - - - Sheet.49 - - - - Sheet.51 - - - - Sheet.53 - - - - Sheet.54 - - - - Sheet.55 - - - - Sheet.56 - - - - Sheet.57 - - - - Sheet.58 - - - - Sheet.60 - - - - - diff --git 
a/docs/data-guide/scenarios/images/olap-data-pipeline.png b/docs/data-guide/images/olap-data-pipeline.png similarity index 100% rename from docs/data-guide/scenarios/images/olap-data-pipeline.png rename to docs/data-guide/images/olap-data-pipeline.png diff --git a/docs/data-guide/concepts/images/semantic-modeling.png b/docs/data-guide/images/semantic-modeling.png similarity index 100% rename from docs/data-guide/concepts/images/semantic-modeling.png rename to docs/data-guide/images/semantic-modeling.png diff --git a/docs/data-guide/index.md b/docs/data-guide/index.md index e443e24707a..327b7c496bb 100644 --- a/docs/data-guide/index.md +++ b/docs/data-guide/index.md @@ -16,58 +16,16 @@ The cloud is changing the way applications are designed, including how data is p ## How this guide is structured -This guide is structured around a basic pivot: The distinction between *relational* data and *non-relational* data. +This guide is structured around two general categories of data solution, *traditional RDBMS workloads* and *big data solutions*. -![](./images/guide-steps.svg) +**[Traditional RDBMS workloads](./relational-data/index.md)**. These workloads include online transaction processing (OLTP) and online analytical processing (OLAP). Data in OLTP systems is typically relational data with a pre-defined schema and a set of constraints to maintain referential integrity. Often, data from multiple sources in the organization may be consolidated into a data warehouse, using an ETL process to move and transform the source data. -Relational data is generally stored in a traditional RDBMS or a data warehouse. It has a pre-defined schema ("schema on write") with a set of constraints to maintain referential integrity. Most relational databases use Structured Query Language (SQL) for querying. Solutions that use relational databases include online transaction processing (OLTP) and online analytical processing (OLAP). 
+![](./images/guide-rdbms.svg) -Non-relational data is any data that does not use the [relational model](https://en.wikipedia.org/wiki/Relational_model) found in traditional RDBMS systems. This may include key-value data, JSON data, graph data, time series data, and other data types. The term *NoSQL* refers to databases that are designed to hold various types of non-relational data. However, the term is not entirely accurate, because many non-relational data stores support SQL compatible queries. Non-relational data and NoSQL databases often come up in discussions of *big data* solutions. A big data architecture is designed to handle the ingestion, processing, and analysis of data that is too large or complex for traditional database systems. +**[Big data solutions](./big-data/index.md)**. A big data architecture is designed to handle the ingestion, processing, and analysis of data that is too large or complex for traditional database systems. The data may be processed in batch or in real time. Big data solutions typically involve a large amount of non-relational data, such as key-value data, JSON documents, or time series data. Often traditional RDBMS systems are not well-suited to store this type of data. The term *NoSQL* refers to a family of databases designed to hold non-relational data. (The term isn't quite accurate, because many non-relational data stores support SQL compatible queries.) -Within each of these two main categories, the Data Architecture Guide contains the following sections: +![](./images/guide-big-data.svg) -- **Concepts.** Overview articles that introduce the main concepts you need to understand when working with this type of data. -- **Scenarios.** A representative set of data scenarios, including a discussion of the relevant Azure services and the appropriate architecture for the scenario. -- **Technology choices.** Detailed comparisons of various data technologies available on Azure, including open source options. 
Within each category, we describe the key selection criteria and a capability matrix, to help you choose the right technology for your scenario. +These two categories are not mutually exclusive, and there is overlap between them, but we feel that it's a useful way to frame the discussion. Within each category, the guide discusses **common scenarios**, including relevant Azure services and the appropriate architecture for the scenario. In addition, the guide compares **technology choices** for data solutions in Azure, including open source options. Within each category, we describe the key selection criteria and a capability matrix, to help you choose the right technology for your scenario. This guide is not intended to teach you data science or database theory — you can find entire books on those subjects. Instead, the goal is to help you select the right data architecture or data pipeline for your scenario, and then select the Azure services and technologies that best fit your requirements. If you already have an architecture in mind, you can skip directly to the technology choices. 
- -## Traditional RDBMS - -### Concepts - -- [Relational data](./concepts/relational-data.md) -- [Transactional data](./concepts/transactional-data.md) -- [Semantic modeling](./concepts/semantic-modeling.md) - -### Scenarios - -- [Online analytical processing (OLAP)](./scenarios/online-analytical-processing.md) -- [Online transaction processing (OLTP)](./scenarios/online-transaction-processing.md) -- [Data warehousing and data marts](./scenarios/data-warehousing.md) -- [ETL](./scenarios/etl.md) - -## Big data and NoSQL - -### Concepts - -- [Non-relational data stores](./concepts/non-relational-data.md) -- [Working with CSV and JSON files](./concepts/csv-and-json.md) -- [Big data architectures](./concepts/big-data.md) -- [Advanced analytics](./concepts/advanced-analytics.md) -- [Machine learning at scale](./concepts/machine-learning-at-scale.md) - -### Scenarios - -- [Batch processing](./scenarios/batch-processing.md) -- [Real time processing](./scenarios/real-time-processing.md) -- [Free-form text search](./scenarios/search.md) -- [Interactive data exploration](./scenarios/interactive-data-exploration.md) -- [Natural language processing](./scenarios/natural-language-processing.md) -- [Time series solutions](./scenarios/time-series.md) - -## Cross-cutting concerns - -- [Data transfer](./scenarios/data-transfer.md) -- [Extending on-premises data solutions to the cloud](./scenarios/hybrid-on-premises-and-cloud.md) -- [Securing data solutions](./scenarios/securing-data-solutions.md) diff --git a/docs/data-guide/technology-choices/data-warehouses.md b/docs/data-guide/relational-data/data-warehousing.md similarity index 65% rename from docs/data-guide/technology-choices/data-warehouses.md rename to docs/data-guide/relational-data/data-warehousing.md index dd648f6892c..1f93c3f8635 100644 --- a/docs/data-guide/technology-choices/data-warehouses.md +++ b/docs/data-guide/relational-data/data-warehousing.md @@ -1,18 +1,49 @@ --- -title: Choosing a data warehouse +title: Data 
warehousing and data marts description: author: zoinerTejada ms:date: 02/12/2018 --- -# Choosing a data warehouse in Azure +# Data warehousing and data marts -A data warehouse is a central, organizational, relational repository of integrated data from one or more disparate sources. This topic compares options for data warehouses in Azure. +A data warehouse is a central, organizational, relational repository of integrated data from one or more disparate sources, across many or all subject areas. Data warehouses store current and historical data and are used for reporting and analysis of the data in different ways. -> [!NOTE] -> For more information about when to use a data warehouse, see [Data warehousing and data marts](../scenarios/data-warehousing.md). +![Data warehousing in Azure](../images/data-warehousing.png) -## What are your options when choosing a data warehouse? +To move data into a data warehouse, it is extracted on a periodic basis from various sources that contain important business information. As the data is moved, it can be formatted, cleaned, validated, summarized, and reorganized. Alternately, the data can be stored in the lowest level of detail, with aggregated views provided in the warehouse for reporting. In either case, the data warehouse becomes a permanent storage space for data used for reporting, analysis, and forming important business decisions using business intelligence (BI) tools. + +## Data marts and operational data stores + +Managing data at scale is complex, and it is becoming less common to have a single data warehouse that represents all data across the entire enterprise. Instead, organizations create smaller, more focused data warehouses, called *data marts*, that expose the desired data for analytics purposes. An orchestration process populates the data marts from data maintained in an operational data store. The operational data store acts as an intermediary between the source transactional system and the data mart. 
Data managed by the operational data store is a cleaned version of the data present in the source transactional system, and is typically a subset of the historical data that is maintained by the data warehouse or data mart. + +## When to use this solution + +Choose a data warehouse when you need to turn massive amounts of data from operational systems into a format that is easy to understand, current, and accurate. Data warehouses do not need to follow the same terse data structure you may be using in your operational/OLTP databases. You can use column names that make sense to business users and analysts, restructure the schema to simplify data relationships, and consolidate several tables into one. These steps help guide users who need to create ad hoc reports, or create reports and analyze the data in BI systems, without the help of a database administrator (DBA) or data developer. + +Consider using a data warehouse when you need to keep historical data separate from the source transaction systems for performance reasons. Data warehouses make it easy to access historical data from multiple locations, by providing a centralized location using common formats, common keys, common data models, and common access methods. + +Data warehouses are optimized for read access, resulting in faster report generation compared to running reports against the source transaction system. In addition, data warehouses provide the following benefits: + +* All historical data from multiple sources can be stored and accessed from a data warehouse as the single source of truth. +* You can improve data quality by cleaning up data as it is imported into the data warehouse, providing more accurate data as well as providing consistent codes and descriptions. +* Reporting tools do not compete with the transactional source systems for query processing cycles. 
A data warehouse allows the transactional system to focus predominantly on handling writes, while the data warehouse satisfies the majority of read requests. +* A data warehouse can help consolidate data from different software. +* Data mining tools can help you find hidden patterns using automatic methodologies against data stored in your warehouse. +* Data warehouses make it easier to provide secure access to authorized users, while restricting access to others. There is no need to grant business users access to the source data, thereby removing a potential attack vector against one or more production transaction systems. +* Data warehouses make it easier to create business intelligence solutions on top of the data, such as [OLAP cubes](online-analytical-processing.md). + +## Challenges + +Properly configuring a data warehouse to fit the needs of your business can bring some of the following challenges: + +* Committing the time required to properly model your business concepts. This is an important step, as data warehouses are information driven, where concept mapping drives the rest of the project. This involves standardizing business-related terms and common formats (such as currency and dates), and restructuring the schema in a way that makes sense to business users but still ensures accuracy of data aggregates and relationships. +* Planning and setting up your data orchestration. Considerations include how to copy data from the source transactional system to the data warehouse, and when to move historical data out of your operational data stores and into the warehouse. +* Maintaining or improving data quality by cleaning the data as it is imported into the warehouse. + +## Data warehousing in Azure + +In Azure, you may have one or more sources of data, whether from customer transactions, or from various business applications used by various departments. This data is traditionally stored in one or more [OLTP](online-transaction-processing.md) databases.
The data could be persisted in other storage mediums such as network shares, Azure Storage Blobs, or a data lake. The data could also be stored by the data warehouse itself or in a relational database such as Azure SQL Database. The purpose of the analytical data store layer is to satisfy queries issued by analytics and reporting tools against the data warehouse or data mart. In Azure, this analytical store capability can be met with Azure SQL Data Warehouse, or with Azure HDInsight using Hive or Interactive Query. In addition, you will need some level of orchestration to periodically move or copy data from data storage to the data warehouse, which can be done using Azure Data Factory or Oozie on Azure HDInsight. There are several options for implementing a data warehouse in Azure, depending on your needs. The following lists are broken into two categories, [symmetric multiprocessing](https://en.wikipedia.org/wiki/Symmetric_multiprocessing) (SMP) and [massively parallel processing](https://en.wikipedia.org/wiki/Massively_parallel) (MPP). @@ -27,7 +58,7 @@ MPP: - [Apache Hive on HDInsight](/azure/hdinsight/hadoop/hdinsight-use-hive) - [Interactive Query (Hive LLAP) on HDInsight](/azure/hdinsight/interactive-query/apache-interactive-query-get-started) -As a general rule, SMP-based warehouses are best suited for small to medium data sets (up to 4-100 TB), while MPP is often used for big data. The delineation between small/medium and big data partly has to do with your organization's definition and supporting infrastructure. (See [Choosing an OLTP data store](oltp-data-stores.md#scalability-capabilities).) +As a general rule, SMP-based warehouses are best suited for small to medium data sets (up to 4-100 TB), while MPP is often used for big data. The delineation between small/medium and big data partly has to do with your organization's definition and supporting infrastructure. 
(See [Choosing an OLTP data store](online-transaction-processing.md#scalability-capabilities).) Beyond data sizes, the type of workload pattern is likely to be a greater determining factor. For example, complex queries may be too slow for an SMP solution, and require an MPP solution instead. MPP-based systems are likely to impose a performance penalty with small data sizes, due to the way jobs are distributed and consolidated across nodes. If your data sizes already exceed 1 TB and are expected to continually grow, consider selecting an MPP solution. However, if your data sizes are less than this, but your workloads are exceeding the available resources of your SMP solution, then MPP may be your best option as well. @@ -56,7 +87,7 @@ To narrow the choices, start by answering these questions: - For a large data set, is the data source structured or unstructured? Unstructured data may need to be processed in a big data environment such as Spark on HDInsight, Azure Databricks, Hive LLAP on HDInsight, or Azure Data Lake Analytics. All of these can serve as ELT (Extract, Load, Transform) and ETL (Extract, Transform, Load) engines. They can output the processed data into structured data, making it easier to load into SQL Data Warehouse or one of the other options. For structured data, SQL Data Warehouse has a performance tier called Optimized for Compute, for compute-intensive workloads requiring ultra-high performance. -- Do you want to separate your historical data from your current, operational data? If so, select one of the options where [orchestration](pipeline-orchestration-data-movement.md) is required. These are standalone warehouses optimized for heavy read access, and are best suited as a separate historical data store. +- Do you want to separate your historical data from your current, operational data? If so, select one of the options where [orchestration](../technology-choices/pipeline-orchestration-data-movement.md) is required. 
These are standalone warehouses optimized for heavy read access, and are best suited as a separate historical data store. - Do you need to integrate data from several sources, beyond your OLTP data store? If so, consider options that easily integrate multiple data sources. diff --git a/docs/data-guide/scenarios/etl.md b/docs/data-guide/relational-data/etl.md similarity index 93% rename from docs/data-guide/scenarios/etl.md rename to docs/data-guide/relational-data/etl.md index f7a9edd0939..426e7d00594 100644 --- a/docs/data-guide/scenarios/etl.md +++ b/docs/data-guide/relational-data/etl.md @@ -17,7 +17,7 @@ Extract, transform, and load (ETL) is a data pipeline used to collect data from The data transformation that takes place usually involves various operations, such as filtering, sorting, aggregating, joining data, cleaning data, deduplicating, and validating data. -![Extract-transform-load (ETL) process](./images/etl.png) +![Extract-transform-load (ETL) process](../images/etl.png) Often, the three ETL phases are run in parallel to save time. For example, while data is being extracted, a transformation process could be working on data already received and prepare it for loading, and a loading process can begin working on the prepared data, rather than waiting for the entire extraction process to complete. @@ -31,7 +31,7 @@ Other tools: Extract, load, and transform (ELT) differs from ETL solely in where the transformation takes place. In the ELT pipeline, the transformation occurs in the target data store. Instead of using a separate transformation engine, the processing capabilities of the target data store are used to transform data. This simplifies the architecture by removing the transformation engine from the pipeline. Another benefit to this approach is that scaling the target data store also scales the ELT pipeline performance. However, ELT only works well when the target system is powerful enough to transform the data efficiently. 
-![Extract-Load-Transform (ELT) process](./images/elt.png) +![Extract-Load-Transform (ELT) process](../images/elt.png) Typical use cases for ELT fall within the big data realm. For example, you might start by extracting all of the source data to flat files in scalable storage such as Hadoop distributed file system (HDFS) or Azure Data Lake Store. Technologies such as Spark, Hive, or PolyBase can then be used to query the source data. The key point with ELT is that the data store used to perform the transformation is the same data store where the data is ultimately consumed. This data store reads directly from the scalable storage, instead of loading the data into its own proprietary storage. This approach skips the data copy step present in ETL, which can be a time consuming operation for large data sets. @@ -58,7 +58,7 @@ In the context of data pipelines, the control flow ensures orderly processing of Control flows execute data flows as a task. In a data flow task, data is extracted from a source, transformed, or loaded into a data store. The output of one data flow task can be the input to the next data flow task, and data flows can run in parallel. Unlike control flows, you cannot add constraints between tasks in a data flow. You can, however, add a data viewer to observe the data as it is processed by each task. -![Data Flow being executed as a task within a Control Flow](./images/control-flow-data-flow.png) +![Data Flow being executed as a task within a Control Flow](../images/control-flow-data-flow.png) In the diagram above, there are several tasks within the control flow, one of which is a data flow task. One of the tasks is nested within a container. Containers can be used to provide structure to tasks, providing a unit of work. One such example is for repeating elements within a collection, such as files in a folder or database statements.
@@ -70,7 +70,7 @@ Other tools: ## Technology choices -- [Online Transaction Processing (OLTP) data stores](../technology-choices/oltp-data-stores.md) -- [Online Analytical Processing (OLAP) data stores](../technology-choices/olap-data-stores.md) -- [Data warehouses](../technology-choices/data-warehouses.md) +- [Online Transaction Processing (OLTP) data stores](./online-transaction-processing.md#oltp-in-azure) +- [Online Analytical Processing (OLAP) data stores](./online-analytical-processing.md#olap-in-azure) +- [Data warehouses](./data-warehousing.md) - [Pipeline orchestration](../technology-choices/pipeline-orchestration-data-movement.md) diff --git a/docs/data-guide/concepts/relational-data.md b/docs/data-guide/relational-data/index.md similarity index 92% rename from docs/data-guide/concepts/relational-data.md rename to docs/data-guide/relational-data/index.md index db63a11aa54..ec0d43686dc 100644 --- a/docs/data-guide/concepts/relational-data.md +++ b/docs/data-guide/relational-data/index.md @@ -5,17 +5,17 @@ author: zoinerTejada ms:date: 02/12/2018 --- -# Relational data +# Traditional relational database solutions Relational data is data modeled using the relational model. In this model, data is expressed as tuples. A *tuple* is a set of attribute/value pairs. For example, a tuple might be (itemid = 5, orderid = 1, item = "Chair", amount = 200.00). A set of tuples that all share the same attributes is called a *relation*. Relations are naturally represented as tables, where each tuple is exposed as a row in the table. However, rows have an explicit ordering, unlike tuples. The database schema defines the columns (headings) of each table. Each column is defined with a name and a data type for all values stored in that column across all rows in the table. 
-![Example showing data using a relational database](./images/example-relational.png) +![Example showing data using a relational database](../images/example-relational.png) A data store that organizes data using the relational model is referred to as a relational database. Primary keys uniquely identify rows within a table. Foreign key fields are used in one table to refer to a row in another table by referencing the primary key of the other table. Foreign keys are used to maintain referential integrity, ensuring that the referenced rows are not altered or deleted while the referencing row depends on them. -![Example showing data using a relational database](./images/example-relational2.png) +![Example showing data using a relational database](../images/example-relational2.png) Relational databases support various types of constraints that help to ensure data integrity: @@ -31,4 +31,4 @@ To improve query performance, relational databases use *indexes*. Primary indexe Because relational databases enforce referential integrity, scaling a relational database can become challenging. That's because any query or insert operation might touch any number of tables. You can scale out a relational database by *sharding* the data, but this requires careful design of the schema. For more information, see [Sharding pattern](../../patterns/sharding.md). -If data is non-relational or has requirements that are not suited to a relational database, consider a [Non-relational or NoSQL](./non-relational-data.md) data store. +If data is non-relational or has requirements that are not suited to a relational database, consider a [Non-relational or NoSQL](../big-data/non-relational-data.md) data store. 
diff --git a/docs/data-guide/relational-data/online-analytical-processing.md b/docs/data-guide/relational-data/online-analytical-processing.md new file mode 100644 index 00000000000..a41aede0519 --- /dev/null +++ b/docs/data-guide/relational-data/online-analytical-processing.md @@ -0,0 +1,143 @@ +--- +title: Online analytical processing (OLAP) +description: +author: zoinerTejada +ms:date: 02/12/2018 +--- + +# Online analytical processing (OLAP) + +Online analytical processing (OLAP) is a technology that organizes large business databases and supports complex analysis. It can be used to perform complex analytical queries without negatively affecting transactional systems. + +The databases that a business uses to store all its transactions and records are called [online transaction processing (OLTP)](online-transaction-processing.md) databases. These databases usually have records that are entered one at a time. Often they contain a great deal of information that is valuable to the organization. The databases that are used for OLTP, however, were not designed for analysis. Therefore, retrieving answers from these databases is costly in terms of time and effort. OLAP systems were designed to help extract this business intelligence information from the data in a highly performant way. This is because OLAP databases are optimized for heavy read, low write workloads. + +![OLAP in Azure](../images/olap-data-pipeline.png) + +## Semantic modeling + +A semantic data model is a conceptual model that describes the meaning of the data elements it contains. Organizations often have their own terms for things, sometimes with synonyms, or even different meanings for the same term. For example, an inventory database might track a piece of equipment with an asset ID and a serial number, but a sales database might refer to the serial number as the asset ID. There is no simple way to relate these values without a model that describes the relationship. 
+ +Semantic modeling provides a level of abstraction over the database schema, so that users don't need to know the underlying data structures. This makes it easier for end users to query data without performing aggregates and joins over the underlying schema. Also, usually columns are renamed to more user-friendly names, so that the context and meaning of the data are more obvious. + +Semantic modeling is predominately used for read-heavy scenarios, such as analytics and business intelligence (OLAP), as opposed to more write-heavy transactional data processing (OLTP). This is mostly due to the nature of a typical semantic layer: + +- Aggregation behaviors are set so that reporting tools display them properly. +- Business logic and calculations are defined. +- Time-oriented calculations are included. +- Data is often integrated from multiple sources. + +Traditionally, the semantic layer is placed over a data warehouse for these reasons. + +![Example diagram of a semantic layer between a data warehouse and a reporting tool](../images/semantic-modeling.png) + +There are two primary types of semantic models: + +* **Tabular**. Uses relational modeling constructs (model, tables, columns). Internally, metadata is inherited from OLAP modeling constructs (cubes, dimensions, measures). Code and script use OLAP metadata. +* **Multidimensional**. Uses traditional OLAP modeling constructs (cubes, dimensions, measures). + +Relevant Azure service: +- [Azure Analysis Services](https://azure.microsoft.com/services/analysis-services/) + +## Example use case + +An organization has data stored in a large database. It wants to make this data available to business users and customers to create their own reports and do some analysis. One option is just to give those users direct access to the database. However, there are several drawbacks to doing this, including managing security and controlling access. 
Also, the design of the database, including the names of tables and columns, may be hard for a user to understand. Users would need to know which tables to query, how those tables should be joined, and other business logic that must be applied to get the correct results. Users would also need to know a query language like SQL even to get started. Typically this leads to multiple users reporting the same metrics but with different results. + +Another option is to encapsulate all of the information that users need into a semantic model. The semantic model can be more easily queried by users with a reporting tool of their choice. The data provided by the semantic model is pulled from a data warehouse, ensuring that all users see a single version of the truth. The semantic model also provides friendly table and column names, relationships between tables, descriptions, calculations, and row-level security. + +## Typical traits of semantic modeling + +Semantic modeling and analytical processing tend to have the following traits: + +| Requirement | Description | +| --- | --- | +| Schema | Schema on write, strongly enforced| +| Uses Transactions | No | +| Locking Strategy | None | +| Updateable | No (typically requires recomputing cube) | +| Appendable | No (typically requires recomputing cube) | +| Workload | Heavy reads, read-only | +| Indexing | Multidimensional indexing | +| Datum size | Small to medium sized | +| Model | Multidimensional | +| Data shape | Cube or star/snowflake schema | +| Query flexibility | Highly flexible | +| Scale | Large (10s-100s GBs) | + +## When to use this solution + +Consider OLAP in the following scenarios: + +- You need to execute complex analytical and ad hoc queries rapidly, without negatively affecting your OLTP systems. +- You want to provide business users with a simple way to generate reports from your data. +- You want to provide a number of aggregations that will allow users to get fast, consistent results.
+ +OLAP is especially useful for applying aggregate calculations over large amounts of data. OLAP systems are optimized for read-heavy scenarios, such as analytics and business intelligence. OLAP allows users to segment multi-dimensional data into slices that can be viewed in two dimensions (such as a pivot table) or filter the data by specific values. This process is sometimes called "slicing and dicing" the data, and can be done regardless of whether the data is partitioned across several data sources. This helps users to find trends, spot patterns, and explore the data without having to know the details of traditional data analysis. + +Semantic models can help business users abstract relationship complexities and make it easier to analyze data quickly. + +## Challenges + +For all the benefits OLAP systems provide, they do produce a few challenges: + +- Whereas data in OLTP systems is constantly updated through transactions flowing in from various sources, OLAP data stores are typically refreshed at much slower intervals, depending on business needs. This means OLAP systems are better suited for strategic business decisions, rather than immediate responses to changes. Also, some level of data cleansing and orchestration needs to be planned to keep the OLAP data stores up-to-date. +- Unlike traditional, normalized, relational tables found in OLTP systems, OLAP data models tend to be multidimensional. This makes it difficult or impossible to directly map to entity-relationship or object-oriented models, where each attribute is mapped to one column. Instead, OLAP systems typically use a star or snowflake schema in place of traditional normalization. + +## OLAP in Azure + +In Azure, data held in OLTP systems such as Azure SQL Database is copied into the OLAP system, such as [Azure Analysis Services](/azure/analysis-services/analysis-services-overview).
Data exploration and visualization tools like [Power BI](https://powerbi.microsoft.com), Excel, and third-party options connect to Analysis Services servers and provide users with highly interactive and visually rich insights into the modeled data. The flow of data from OLTP data to OLAP is typically orchestrated using SQL Server Integration Services, which can be executed using [Azure Data Factory](/azure/data-factory/concepts-integration-runtime). + +In Azure, all of the following data stores will meet the core requirements for OLAP: + +- [SQL Server with Columnstore indexes](/sql/relational-databases/indexes/get-started-with-columnstore-for-real-time-operational-analytics) +- [Azure Analysis Services](/azure/analysis-services/analysis-services-overview) +- [SQL Server Analysis Services (SSAS)](/sql/analysis-services/analysis-services) + +SQL Server Analysis Services (SSAS) offers OLAP and data mining functionality for business intelligence applications. You can either install SSAS on local servers, or host within a virtual machine in Azure. Azure Analysis Services is a fully managed service that provides the same major features as SSAS. Azure Analysis Services supports connecting to [various data sources](/azure/analysis-services/analysis-services-datasource) in the cloud and on-premises in your organization. + +Clustered Columnstore indexes are available in SQL Server 2014 and above, as well as Azure SQL Database, and are ideal for OLAP workloads. However, beginning with SQL Server 2016 (including Azure SQL Database), you can take advantage of hybrid transactional/analytics processing (HTAP) through the use of updateable nonclustered columnstore indexes. HTAP enables you to perform OLTP and OLAP processing on the same platform, which removes the need to store multiple copies of your data, and eliminates the need for distinct OLTP and OLAP systems. 
For more information, see [Get started with Columnstore for real-time operational analytics](/sql/relational-databases/indexes/get-started-with-columnstore-for-real-time-operational-analytics). + +## Key selection criteria + +To narrow the choices, start by answering these questions: + +- Do you want a managed service rather than managing your own servers? + +- Do you require secure authentication using Azure Active Directory (Azure AD)? + +- Do you want to conduct real-time analytics? If so, narrow your options to those that support real-time analytics. + + *Real-time analytics* in this context applies to a single data source, such as an enterprise resource planning (ERP) application, that will run both an operational and an analytics workload. If you need to integrate data from multiple sources, or require extreme analytics performance by using pre-aggregated data such as cubes, you might still require a separate data warehouse. + +- Do you need to use pre-aggregated data, for example to provide semantic models that make analytics more business user friendly? If yes, choose an option that supports multidimensional cubes or tabular semantic models. + + Providing aggregates can help users consistently calculate data aggregates. Pre-aggregated data can also provide a large performance boost when dealing with several columns across many rows. Data can be pre-aggregated in multidimensional cubes or tabular semantic models. + +- Do you need to integrate data from several sources, beyond your OLTP data store? If so, consider options that easily integrate multiple data sources. + +## Capability matrix + +The following tables summarize the key differences in capabilities. 
+ +### General capabilities + +| | Azure Analysis Services | SQL Server Analysis Services | SQL Server with Columnstore Indexes | Azure SQL Database with Columnstore Indexes | +| --- | --- | --- | --- | --- | +| Is managed service | Yes | No | No | Yes | +| Supports multidimensional cubes | No | Yes | No | No | +| Supports tabular semantic models | Yes | Yes | No | No | +| Easily integrate multiple data sources | Yes | Yes | No [1] | No [1] | +| Supports real-time analytics | No | No | Yes | Yes | +| Requires process to copy data from source(s) | Yes | Yes | No | No | +| Azure AD integration | Yes | No | No [2] | Yes | + +[1] Although SQL Server and Azure SQL Database cannot be used to query from and integrate multiple external data sources, you can still build a pipeline that does this for you using [SSIS](/sql/integration-services/sql-server-integration-services) or [Azure Data Factory](/azure/data-factory/). SQL Server hosted in an Azure VM has additional options, such as linked servers and [PolyBase](/sql/relational-databases/polybase/polybase-guide). For more information, see [Pipeline orchestration, control flow, and data movement](../technology-choices/pipeline-orchestration-data-movement.md). + +[2] Connecting to SQL Server running on an Azure Virtual Machine is not supported using an Azure AD account. Use a domain Active Directory account instead.
+ +### Scalability Capabilities + +| | Azure Analysis Services | SQL Server Analysis Services | SQL Server with Columnstore Indexes | Azure SQL Database with Columnstore Indexes | +| --- | --- | --- | --- | --- | +| Redundant regional servers for high availability | Yes | No | Yes | Yes | +| Supports query scale out | Yes | No | Yes | No | +| Dynamic scalability (scale up) | Yes | No | Yes | No | diff --git a/docs/data-guide/relational-data/online-transaction-processing.md b/docs/data-guide/relational-data/online-transaction-processing.md new file mode 100644 index 00000000000..9838797182b --- /dev/null +++ b/docs/data-guide/relational-data/online-transaction-processing.md @@ -0,0 +1,138 @@ +--- +title: Online transaction processing (OLTP) +description: +author: zoinerTejada +ms:date: 02/12/2018 +--- + +# Online transaction processing (OLTP) + +The management of transactional data using computer systems is referred to as Online Transaction Processing (OLTP). OLTP systems record business interactions as they occur in the day-to-day operation of the organization, and support querying of this data to make inferences. + +## Transactional data + +Transactional data is information that tracks the interactions related to an organization's activities. These interactions are typically business transactions, such as payments received from customers, payments made to suppliers, products moving through inventory, orders taken, or services delivered. Transactional events, which represent the transactions themselves, typically contain a time dimension, some numerical values, and references to other data. + +Transactions typically need to be *atomic* and *consistent*. Atomicity means that an entire transaction always succeeds or fails as one unit of work, and is never left in a half-completed state. If a transaction cannot be completed, the database system must roll back any steps that were already done as part of that transaction. 
In a traditional RDBMS, this rollback happens automatically if a transaction cannot be completed. Consistency means that transactions always leave the data in a valid state. (These are very informal descriptions of atomicity and consistency. There are more formal definitions of these properties, such as [ACID](https://en.wikipedia.org/wiki/ACID).) + +Transactional databases can support strong consistency for transactions using various locking strategies, such as pessimistic locking, to ensure that all data is strongly consistent within the context of the enterprise, for all users and processes. + +The most common deployment architecture that uses transactional data is the data store tier in a 3-tier architecture. A 3-tier architecture typically consists of a presentation tier, business logic tier, and data store tier. A related deployment architecture is the [N-tier](/azure/architecture/guide/architecture-styles/n-tier) architecture, which may have multiple middle-tiers handling business logic. + +## Typical traits of transactional data + +Transactional data tends to have the following traits: + +| Requirement | Description | +| --- | --- | +| Normalization | Highly normalized | +| Schema | Schema on write, strongly enforced| +| Consistency | Strong consistency, ACID guarantees | +| Integrity | High integrity | +| Uses transactions | Yes | +| Locking strategy | Pessimistic or optimistic| +| Updateable | Yes | +| Appendable | Yes | +| Workload | Heavy writes, moderate reads | +| Indexing | Primary and secondary indexes | +| Datum size | Small to medium sized | +| Model | Relational | +| Data shape | Tabular | +| Query flexibility | Highly flexible | +| Scale | Small (MBs) to Large (a few TBs) | + +## When to use this solution + +Choose OLTP when you need to efficiently process and store business transactions and immediately make them available to client applications in a consistent way. 
Use this architecture when any tangible delay in processing would have a negative impact on the day-to-day operations of the business. + +OLTP systems are designed to efficiently process and store transactions, as well as query transactional data. The goal of efficiently processing and storing individual transactions by an OLTP system is partly accomplished by data normalization — that is, breaking the data up into smaller chunks that are less redundant. This supports efficiency because it enables the OLTP system to process large numbers of transactions independently, and avoids extra processing needed to maintain data integrity in the presence of redundant data. + +## Challenges +Implementing and using an OLTP system can create a few challenges: + +- OLTP systems are not always good for handling aggregates over large amounts of data, although there are exceptions, such as a well-planned SQL Server-based solution. Analytics against the data, that rely on aggregate calculations over millions of individual transactions, are very resource intensive for an OLTP system. They can be slow to execute and can cause a slow-down by blocking other transactions in the database. +- When conducting analytics and reporting on data that is highly normalized, the queries tend to be complex, because most queries need to de-normalize the data by using joins. Also, naming conventions for database objects in OLTP systems tend to be terse and succinct. The increased normalization coupled with terse naming conventions makes OLTP systems difficult for business users to query, without the help of a DBA or data developer. +- Storing the history of transactions indefinitely and storing too much data in any one table can lead to slow query performance, depending on the number of transactions stored. 
The common solution is to maintain a relevant window of time (such as the current fiscal year) in the OLTP system and offload historical data to other systems, such as a data mart or [data warehouse](./data-warehousing.md). + +## OLTP in Azure + +Applications such as websites hosted in [App Service Web Apps](/azure/app-service/app-service-web-overview), REST APIs running in App Service, or mobile or desktop applications communicate with the OLTP system, typically via a REST API intermediary. + +In practice, most workloads are not purely OLTP. There tends to be an analytical component as well. In addition, there is an increasing demand for real-time reporting, such as running reports against the operational system. This is also referred to as HTAP (Hybrid Transactional and Analytical Processing). For more information, see [Online Analytical Processing (OLAP)](./online-analytical-processing.md). + +In Azure, all of the following data stores will meet the core requirements for OLTP and the management of transaction data: + +- [Azure SQL Database](/azure/sql-database/) +- [SQL Server in an Azure virtual machine](/azure/virtual-machines/windows/sql/virtual-machines-windows-sql-server-iaas-overview?toc=%2Fazure%2Fvirtual-machines%2Fwindows%2Ftoc.json) +- [Azure Database for MySQL](/azure/mysql/) +- [Azure Database for PostgreSQL](/azure/postgresql/) + +## Key selection criteria + +To narrow the choices, start by answering these questions: + +- Do you want a managed service rather than managing your own servers? + +- Does your solution have specific dependencies for Microsoft SQL Server, MySQL or PostgreSQL compatibility? Your application may limit the data stores you can choose based on the drivers it supports for communicating with the data store, or the assumptions it makes about which database is used. + +- Are your write throughput requirements particularly high? If yes, choose an option that provides in-memory tables. + +- Is your solution multi-tenant? 
If so, consider options that support capacity pools, where multiple database instances draw from an elastic pool of resources, instead of fixed resources per database. This can help you better distribute capacity across all database instances, and can make your solution more cost effective. + +- Does your data need to be readable with low latency in multiple regions? If yes, choose an option that supports readable secondary replicas. + +- Does your database need to be highly available across geographic regions? If yes, choose an option that supports geographic replication. Also consider the options that support automatic failover from the primary replica to a secondary replica. + +- Does your database have specific security needs? If yes, examine the options that provide capabilities like row level security, data masking, and transparent data encryption. + +## Capability matrix + +The following tables summarize the key differences in capabilities. + +### General capabilities +| | Azure SQL Database | SQL Server in an Azure virtual machine | Azure Database for MySQL | Azure Database for PostgreSQL | +| --- | --- | --- | --- | --- | +| Is Managed Service | Yes | No | Yes | Yes | +| Runs on Platform | N/A | Windows, Linux, Docker | N/A | N/A | +| Programmability 1 | T-SQL, .NET, R | T-SQL, .NET, R, Python | SQL | SQL | + +[1] Not including client driver support, which allows many programming languages to connect to and use the OLTP data store.
+ +### Scalability capabilities +| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| +| --- | --- | --- | --- | --- | +| Maximum database instance size | [4 TB](/azure/sql-database/sql-database-resource-limits) | 256 TB | [1 TB](/azure/mysql/concepts-limits) | [1 TB](/azure/postgresql/concepts-limits) | +| Supports capacity pools | Yes | Yes | No | No | +| Supports clusters scale out | No | Yes | No | No | +| Dynamic scalability (scale up) | Yes | No | Yes | Yes | + +### Analytic workload capabilities +| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| +| --- | --- | --- | --- | --- | +| Temporal tables | Yes | Yes | No | No | +| In-memory (memory-optimized) tables | Yes | Yes | No | No | +| Columnstore support | Yes | Yes | No | No | +| Adaptive query processing | Yes | Yes | No | No | + +### Availability capabilities +| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| +| --- | --- | --- | --- | --- | +| Readable secondaries | Yes | Yes | No | No | +| Geographic replication | Yes | Yes | No | No | +| Automatic failover to secondary | Yes | No | No | No| +| Point-in-time restore | Yes | Yes | Yes | Yes | + +### Security capabilities +| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| +| --- | --- | --- | --- | --- | +| Row level security | Yes | Yes | Yes | Yes | +| Data masking | Yes | Yes | No | No | +| Transparent data encryption | Yes | Yes | Yes | Yes | +| Restrict access to specific IP addresses | Yes | Yes | Yes | Yes | +| Restrict access to allow VNET access only | Yes | Yes | No | No | +| Azure Active Directory authentication | Yes | Yes | No | No | +| Active Directory authentication | No | Yes | No | No | +| Multi-factor authentication | Yes | Yes | 
No | No | +| Supports [Always Encrypted](/sql/relational-databases/security/encryption/always-encrypted-database-engine) | Yes | Yes | No | No | +| Private IP | No | Yes | No | No | + diff --git a/docs/data-guide/concepts/advanced-analytics.md b/docs/data-guide/scenarios/advanced-analytics.md similarity index 100% rename from docs/data-guide/concepts/advanced-analytics.md rename to docs/data-guide/scenarios/advanced-analytics.md diff --git a/docs/data-guide/concepts/csv-and-json.md b/docs/data-guide/scenarios/csv-and-json.md similarity index 98% rename from docs/data-guide/concepts/csv-and-json.md rename to docs/data-guide/scenarios/csv-and-json.md index 7bdd87c61c9..c5e09599271 100644 --- a/docs/data-guide/concepts/csv-and-json.md +++ b/docs/data-guide/scenarios/csv-and-json.md @@ -40,7 +40,7 @@ CSV and JSON file formats both make it easy to exchange data between dissimilar Azure provides several solutions for working with CSV and JSON files, depending on your needs. The primary landing place for these files is either Azure Storage or Azure Data Lake Store. Most Azure services that work with these and other text-based files integrate with either object storage service. In some situations, however, you may opt to directly import the data into Azure SQL or some other data store. SQL Server has native support for storing and working with JSON documents, which makes it easy to [import and process those types of files](/sql/relational-databases/json/import-json-documents-into-sql-server). You can use a utility like SQL Bulk Import to easily [import CSV files](/sql/relational-databases/json/import-json-documents-into-sql-server). -Depending on the scenario, you may perform [batch processing](../scenarios/batch-processing.md) or [real-time processing](../scenarios/real-time-processing.md) of the data.
+Depending on the scenario, you may perform [batch processing](../big-data/batch-processing.md) or [real-time processing](../big-data/real-time-processing.md) of the data. ## Challenges diff --git a/docs/data-guide/concepts/data-lake.md b/docs/data-guide/scenarios/data-lake.md similarity index 82% rename from docs/data-guide/concepts/data-lake.md rename to docs/data-guide/scenarios/data-lake.md index bbd186c6c94..16447bf683c 100644 --- a/docs/data-guide/concepts/data-lake.md +++ b/docs/data-guide/scenarios/data-lake.md @@ -1,6 +1,6 @@ # Data lakes -A data lake is a storage repository that holds a large amount of data in its native, raw format. Data lake stores are optimized for scaling to terabytes and petabytes of data. The data typically comes from multiple heterogeneous sources, and may be structured, semi-structured, or unstructured. The idea with a data lake is to store everything in its original, untransformed state. This approach differs from a traditional [data warehouse](../scenarios/data-warehousing.md), which transforms and processes the data at the time of ingestion. +A data lake is a storage repository that holds a large amount of data in its native, raw format. Data lake stores are optimized for scaling to terabytes and petabytes of data. The data typically comes from multiple heterogeneous sources, and may be structured, semi-structured, or unstructured. The idea with a data lake is to store everything in its original, untransformed state. This approach differs from a traditional [data warehouse](../relational-data/data-warehousing.md), which transforms and processes the data at the time of ingestion. Advantages of a data lake: @@ -13,9 +13,9 @@ A complete data lake solution consists of both storage and processing. Data lake ## When to use a data lake -Typical uses for a data lake include [data exploration](../scenarios/interactive-data-exploration.md), data analytics, and machine learning. 
+Typical uses for a data lake include [data exploration](./interactive-data-exploration.md), data analytics, and machine learning. -A data lake can also act as the data source for a data warehouse. With this approach, the raw data is ingested into the data lake and then transformed into a structured queryable format. Typically this transformation uses an [ELT](../scenarios/etl.md#extract-load-and-transform-elt) (extract-load-transform) pipeline, where the data is ingested and transformed in place. Source data that is already relational may go directly into the data warehouse, using an ETL process, skipping the data lake. +A data lake can also act as the data source for a data warehouse. With this approach, the raw data is ingested into the data lake and then transformed into a structured queryable format. Typically this transformation uses an [ELT](../relational-data/etl.md#extract-load-and-transform-elt) (extract-load-transform) pipeline, where the data is ingested and transformed in place. Source data that is already relational may go directly into the data warehouse, using an ETL process, skipping the data lake. Data lake stores are often used in event streaming or IoT scenarios, because they can persist large amounts of relational and nonrelational data without transformation or schema definition. They are built to handle high volumes of small writes at low latency, and are optimized for massive throughput. diff --git a/docs/data-guide/scenarios/data-warehousing.md b/docs/data-guide/scenarios/data-warehousing.md deleted file mode 100644 index 0c5916bc719..00000000000 --- a/docs/data-guide/scenarios/data-warehousing.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -title: Data warehousing and data marts -description: -author: zoinerTejada -ms:date: 02/12/2018 ---- - -# Data warehousing and data marts - -A data warehouse is a central, organizational, relational repository of integrated data from one or more disparate sources, across many or all subject areas. 
Data warehouses store current and historical data and are used for reporting and analysis of the data in different ways. - -![Data warehousing in Azure](./images/data-warehousing.png) - -To move data into a data warehouse, it is extracted on a periodic basis from various sources that contain important business information. As the data is moved, it can be formatted, cleaned, validated, summarized, and reorganized. Alternately, the data can be stored in the lowest level of detail, with aggregated views provided in the warehouse for reporting. In either case, the data warehouse becomes a permanent storage space for data used for reporting, analysis, and forming important business decisions using business intelligence (BI) tools. - -## Data marts and operational data stores - -Managing data at scale is complex, and it is becoming less common to have a single data warehouse that represents all data across the entire enterprise. Instead, organizations create smaller, more focused data warehouses, called *data marts*, that expose the desired data for analytics purposes. An orchestration process populates the data marts from data maintained in an operational data store. The operational data store acts as an intermediary between the source transactional system and the data mart. Data managed by the operational data store is a cleaned version of the data present in the source transactional system, and is typically a subset of the historical data that is maintained by the data warehouse or data mart. - -## When to use this solution - -Choose a data warehouse when you need to turn massive amounts of data from operational systems into a format that is easy to understand, current, and accurate. Data warehouses do not need to follow the same terse data structure you may be using in your operational/OLTP databases. You can use column names that make sense to business users and analysts, restructure the schema to simplify data relationships, and consolidate several tables into one. 
These steps help guide users who need to create ad hoc reports, or create reports and analyze the data in BI systems, without the help of a database administrator (DBA) or data developer. - -Consider using a data warehouse when you need to keep historical data separate from the source transaction systems for performance reasons. Data warehouses make it easy to access historical data from multiple locations, by providing a centralized location using common formats, common keys, common data models, and common access methods. - -Data warehouses are optimized for read access, resulting in faster report generation compared to running reports against the source transaction system. In addition, data warehouses provide the following benefits: - -* All historical data from multiple sources can be stored and accessed from a data warehouse as the single source of truth. -* You can improve data quality by cleaning up data as it is imported into the data warehouse, providing more accurate data as well as providing consistent codes and descriptions. -* Reporting tools do not compete with the transactional source systems for query processing cycles. A data warehouse allows the transactional system to focus predominantly on handling writes, while the data warehouse satisfies the majority of read requests. -* A data warehouse can help consolidate data from different software. -* Data mining tools can help you find hidden patterns using automatic methodologies against data stored in your warehouse. -* Data warehouses make it easier to provide secure access to authorized users, while restricting access to others. There is no need to grant business users access to the source data, thereby removing a potential attack vector against one or more production transaction systems. -* Data warehouses make it easier to create business intelligence solutions on top of the data, such as [OLAP cubes](online-analytical-processing.md). 
- -## Challenges - -Properly configuring a data warehouse to fit the needs of your business can bring some of the following challenges: - -* Committing the time required to properly model your business concepts. This is an important step, as data warehouses are information driven, where concept mapping drives the rest of the project. This involves standardizing business-related terms and common formats (such as currency and dates), and restructuring the schema in a way that makes sense to business users but still ensures accuracy of data aggregates and relationships. -* Planning and setting up your data orchestration. Consideration include how to copy data from the source transactional system to the data warehouse, and when to move historical data out of your operational data stores and into the warehouse. -* Maintaining or improving data quality by cleaning the data as it is imported into the warehouse. - -## Data warehousing in Azure - -In Azure, you may have one or more sources of data, whether from customer transactions, or from various business applications used by various departments. This data is traditionally stored in one or more [OLTP](online-transaction-processing.md) databases. The data could be persisted in other storage mediums such as network shares, Azure Storage Blobs, or a data lake. The data could also be stored by the data warehouse itself or in a relational database such as Azure SQL Database. The purpose of the analytical data store layer is to satisfy queries issued by analytics and reporting tools against the data warehouse or data mart. In Azure, this analytical store capability can be met with Azure SQL Data Warehouse, or with Azure HDInsight using Hive or Interactive Query. In addition, you will need some level of orchestration to periodically move or copy data from data storage to the data warehouse, which can be done using Azure Data Factory or Oozie on Azure HDInsight. 
- -Related services: - -* [Azure SQL Database](/azure/sql-database/) -* [SQL Server in a VM](/sql/sql-server/sql-server-technical-documentation) -* [Azure Data Warehouse](/azure/sql-data-warehouse/sql-data-warehouse-overview-what-is) -* [Apache Hive on HDInsight](/azure/hdinsight/hadoop/hdinsight-use-hive) -* [Interactive Query (Hive LLAP) on HDInsight](/azure/hdinsight/interactive-query/apache-interactive-query-get-started) - - -## Technology choices - -- [Data warehouses](../technology-choices/data-warehouses.md) -- [Pipeline orchestration](../technology-choices/pipeline-orchestration-data-movement.md) - diff --git a/docs/data-guide/scenarios/images/oltp-data-pipeline.png b/docs/data-guide/scenarios/images/oltp-data-pipeline.png deleted file mode 100644 index deeadfcb806..00000000000 Binary files a/docs/data-guide/scenarios/images/oltp-data-pipeline.png and /dev/null differ diff --git a/docs/data-guide/scenarios/interactive-data-exploration.md b/docs/data-guide/scenarios/interactive-data-exploration.md index 7345ec86aee..14fe70bb5b2 100644 --- a/docs/data-guide/scenarios/interactive-data-exploration.md +++ b/docs/data-guide/scenarios/interactive-data-exploration.md @@ -48,7 +48,7 @@ Relevant Azure services: ## Architecture -Although the goal of this scenario is to support interactive data analysis, the data cleansing, sampling, and structuring tasks involved in data science often include long-running processes. That makes a [batch processing](./batch-processing.md) architecture appropriate. +Although the goal of this scenario is to support interactive data analysis, the data cleansing, sampling, and structuring tasks involved in data science often include long-running processes. That makes a [batch processing](../big-data/batch-processing.md) architecture appropriate. 
## Technology choices diff --git a/docs/data-guide/scenarios/natural-language-processing.md b/docs/data-guide/scenarios/natural-language-processing.md index 30551a66847..d5744b182cc 100644 --- a/docs/data-guide/scenarios/natural-language-processing.md +++ b/docs/data-guide/scenarios/natural-language-processing.md @@ -30,7 +30,7 @@ When using NLP to extract information and insight from free-form text, the start ## Architecture -In an NLP solution, free-form text processing is performed against documents containing paragraphs of text. The overall architecture can be a [batch processing](./batch-processing.md) or [real-time stream processing](./real-time-processing.md) architecture. +In an NLP solution, free-form text processing is performed against documents containing paragraphs of text. The overall architecture can be a [batch processing](../big-data/batch-processing.md) or [real-time stream processing](../big-data/real-time-processing.md) architecture. The actual processing varies based on the desired outcome, but in terms of the pipeline, NLP may be applied in a batch or real-time fashion. For example, sentiment analysis can be used against blocks of text to produce a sentiment score. This can could be done by running a batch process against data in storage, or in real time using smaller chunks of data flowing through a messaging service. diff --git a/docs/data-guide/scenarios/online-analytical-processing.md b/docs/data-guide/scenarios/online-analytical-processing.md deleted file mode 100644 index 466bb32219d..00000000000 --- a/docs/data-guide/scenarios/online-analytical-processing.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Online analytical processing (OLAP) -description: -author: zoinerTejada -ms:date: 02/12/2018 ---- - -# Online analytical processing (OLAP) - -Online analytical processing (OLAP) is a technology that organizes large business databases and supports complex analysis. 
It can be used to perform complex analytical queries without negatively affecting transactional systems. - -The databases that a business uses to store all its transactions and records are called [online transaction processing (OLTP)](online-transaction-processing.md) databases. These databases usually have records that are entered one at a time. Often they contain a great deal of information that is valuable to the organization. The databases that are used for OLTP, however, were not designed for analysis. Therefore, retrieving answers from these databases is costly in terms of time and effort. OLAP systems were designed to help extract this business intelligence information from the data in a highly performant way. This is because OLAP databases are optimized for heavy read, low write workloads. - -![OLAP in Azure](./images/olap-data-pipeline.png) - -## When to use this solution - -Consider OLAP in the following scenarios: - -- You need to execute complex analytical and ad hoc queries rapidly, without negatively affecting your OLTP systems. -- You want to provide business users with a simple way to generate reports from your data -- You want to provide a number of aggregations that will allow users to get fast, consistent results. - -OLAP is especially useful for applying aggregate calculations over large amounts of data. OLAP systems are optimized for read-heavy scenarios, such as analytics and business intelligence. OLAP allows users to segment multi-dimensional data into slices that can be viewed in two dimensions (such as a pivot table) or filter the data by specific values. This process is sometimes called "slicing and dicing" the data, and can be done regardless of whether the data is partitioned across several data sources. This helps users to find trends, spot patterns, and explore the data without having to know the details of traditional data analysis. 
- -[Semantic models](../concepts/semantic-modeling.md) can help business users abstract relationship complexities and make it easier to analyze data quickly. - -## Challenges - -For all the benefits OLAP systems provide, they do produce a few challenges: - -- Whereas data in OLTP systems is constantly updated through transactions flowing in from various sources, OLAP data stores are typically refreshed at a much slower intervals, depending on business needs. This means OLAP systems are better suited for strategic business decisions, rather than immediate responses to changes. Also, some level of data cleansing and orchestration needs to be planned to keep the OLAP data stores up-to-date. -- Unlike traditional, normalized, relational tables found in OLTP systems, OLAP data models tend to be multidimensional. This makes it difficult or impossible to directly map to entity-relationship or object-oriented models, where each attribute is mapped to one column. Instead, OLAP systems typically use a star or snowflake schema in place of traditional normalization. - -## OLAP in Azure - -In Azure, data held in OLTP systems such as Azure SQL Database is copied into the OLAP system, such as [Azure Analysis Services](/azure/analysis-services/analysis-services-overview). Data exploration and visualization tools like [Power BI](https://powerbi.microsoft.com), Excel, and third-party options connect to Analysis Services servers and provide users with highly interactive and visually rich insights into the modeled data. The flow of data from OLTP data to OLAP is typically orchestrated using SQL Server Integration Services, which can be executed using [Azure Data Factory](/azure/data-factory/concepts-integration-runtime). 
- -## Technology choices - -- [Online Analytical Processing (OLAP) data stores](../technology-choices/olap-data-stores.md) - diff --git a/docs/data-guide/scenarios/online-transaction-processing.md b/docs/data-guide/scenarios/online-transaction-processing.md deleted file mode 100644 index 37e7bb4f483..00000000000 --- a/docs/data-guide/scenarios/online-transaction-processing.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: Online transaction processing (OLTP) -description: -author: zoinerTejada -ms:date: 02/12/2018 ---- - -# Online transaction processing (OLTP) - -The management of [transactional data](../concepts/transactional-data.md) using computer systems is referred to as Online Transaction Processing (OLTP). OLTP systems record business interactions as they occur in the day-to-day operation of the organization, and support querying of this data to make inferences. - -![OLTP in Azure](./images/oltp-data-pipeline.png) - -## When to use this solution - -Choose OLTP when you need to efficiently process and store business transactions and immediately make them available to client applications in a consistent way. Use this architecture when any tangible delay in processing would have a negative impact on the day-to-day operations of the business. - -OLTP systems are designed to efficiently process and store transactions, as well as query transactional data. The goal of efficiently processing and storing individual transactions by an OLTP system is partly accomplished by data normalization — that is, breaking the data up into smaller chunks that are less redundant. This supports efficiency because it enables the OLTP system to process large numbers of transactions independently, and avoids extra processing needed to maintain data integrity in the presence of redundant data. 
- -## Challenges -Implementing and using an OLTP system can create a few challenges: - -- OLTP systems are not always good for handling aggregates over large amounts of data, although there are exceptions, such as a well-planned SQL Server-based solution. Analytics against the data, that rely on aggregate calculations over millions of individual transactions, are very resource intensive for an OLTP system. They can be slow to execute and can cause a slow-down by blocking other transactions in the database. -- When conducting analytics and reporting on data that is highly normalized, the queries tend to be complex, because most queries need to de-normalize the data by using joins. Also, naming conventions for database objects in OLTP systems tend to be terse and succinct. The increased normalization coupled with terse naming conventions makes OLTP systems difficult for business users to query, without the help of a DBA or data developer. -- Storing the history of transactions indefinitely and storing too much data in any one table can lead to slow query performance, depending on the number of transactions stored. The common solution is to maintain a relevant window of time (such as the current fiscal year) in the OLTP system and offload historical data to other systems, such as a data mart or [data warehouse](../technology-choices/data-warehouses.md). - -## OLTP in Azure - -Applications such as websites hosted in [App Service Web Apps](/azure/app-service/app-service-web-overview), REST APIs running in App Service, or mobile or desktop applications communicate with the OLTP system, typically via a REST API intermediary. - -In practice, most workloads are not purely OLTP. There tends to be an [analytical component](../scenarios/online-analytical-processing.md) as well. In addition, there is an increasing demand for real-time reporting, such as running reports against the operational system. 
This is also referred to as HTAP (Hybrid Transactional and Analytical Processing). For more information, see [Online Analytical Processing (OLAP) data stores](../technology-choices/olap-data-stores.md). - -## Technology choices - -Data storage: - -- [Azure SQL Database](/azure/sql-database/) -- [SQL Server in an Azure VM](/azure/virtual-machines/windows/sql/virtual-machines-windows-sql-server-iaas-overview?toc=%2Fazure%2Fvirtual-machines%2Fwindows%2Ftoc.json) -- [Azure Database for MySQL](/azure/mysql/) -- [Azure Database for PostgreSQL](/azure/postgresql/) - -For more information, see [Choosing an OLTP data store](../technology-choices/oltp-data-stores.md) - -Data sources: - -- [App service](/azure/app-service/) -- [Mobile Apps](/azure/app-service-mobile/) - diff --git a/docs/data-guide/scenarios/time-series.md b/docs/data-guide/scenarios/time-series.md index 50a8dce3fd0..f25cb8a2d5b 100644 --- a/docs/data-guide/scenarios/time-series.md +++ b/docs/data-guide/scenarios/time-series.md @@ -38,7 +38,7 @@ Using time series offers the following benefits: Data collected by IoT devices is a natural fit for time series storage and analysis. The incoming data is inserted and rarely, if ever, updated. The data is time stamped and inserted in the order it was received, and this data is typically displayed in chronological order, enabling users to discover trends, spot anomalies, and use the information for predictive analysis. -For more information, see [Internet of Things](../concepts/big-data.md#internet-of-things-iot). +For more information, see [Internet of Things](../big-data/index.md#internet-of-things-iot). ### Real-time analytics @@ -53,7 +53,7 @@ Ideally, you would have a stream processing layer that can handle the incoming d ## Architecture -In many scenarios that involve time series data, such as IoT, the data is captured in real time. As such, a [real-time processing](./real-time-processing.md) architecture is appropriate. 
+In many scenarios that involve time series data, such as IoT, the data is captured in real time. As such, a [real-time processing](../big-data/real-time-processing.md) architecture is appropriate. Data from one or more data sources is ingested into the stream buffering layer by [IoT Hub](/azure/iot-hub/), [Event Hubs](/azure/event-hubs/), or [Kafka on HDInsight](/azure/hdinsight/kafka/apache-kafka-introduction). Next, the data is processed in the stream processing layer that can optionally hand off the processed data to a machine learning service for predictive analytics. The processed data is stored in an analytical data store, such as [HBase](/azure/hdinsight/hbase/apache-hbase-overview), [Azure Cosmos DB](/azure/cosmos-db/), Azure Data Lake, or Blob Storage. An analytics and reporting application or service, like Power BI or OpenTSDB (if stored in HBase) can be used to display the time series data for analysis. diff --git a/docs/data-guide/technology-choices/analytical-data-stores.md b/docs/data-guide/technology-choices/analytical-data-stores.md index e88a309f11e..2a60a7ee039 100644 --- a/docs/data-guide/technology-choices/analytical-data-stores.md +++ b/docs/data-guide/technology-choices/analytical-data-stores.md @@ -7,9 +7,9 @@ ms:date: 02/12/2018 # Choosing an analytical data store in Azure -In a [big data](../concepts/big-data.md) architecture, there is often a need for an analytical data store that serves processed data in a structured format that can be queried using analytical tools. Analytical data stores that support querying of both hot-path and cold-path data are collectively referred to as the serving layer, or data serving storage. +In a [big data](../big-data/index.md) architecture, there is often a need for an analytical data store that serves processed data in a structured format that can be queried using analytical tools. 
Analytical data stores that support querying of both hot-path and cold-path data are collectively referred to as the serving layer, or data serving storage. -The serving layer deals with processed data from both the hot path and cold path. In the [lambda architecture](../concepts/big-data.md#lambda-architecture), the serving layer is subdivided into a _speed serving_ layer, which stores data that has been processed incrementally, and a _batch serving_ layer, which contains the batch-processed output. The serving layer requires strong support for random reads with low latency. Data storage for the speed layer should also support random writes, because batch loading data into this store would introduce undesired delays. On the other hand, data storage for the batch layer does not need to support random writes, but batch writes instead. +The serving layer deals with processed data from both the hot path and cold path. In the [lambda architecture](../big-data/index.md#lambda-architecture), the serving layer is subdivided into a _speed serving_ layer, which stores data that has been processed incrementally, and a _batch serving_ layer, which contains the batch-processed output. The serving layer requires strong support for random reads with low latency. Data storage for the speed layer should also support random writes, because batch loading data into this store would introduce undesired delays. On the other hand, data storage for the batch layer does not need to support random writes, but batch writes instead. There is no single best data management choice for all data storage tasks. Different data management solutions are optimized for different tasks. Most real-world cloud apps and big data processes have a variety of data storage requirements and often use a combination of data storage solutions. 
diff --git a/docs/data-guide/technology-choices/olap-data-stores.md b/docs/data-guide/technology-choices/olap-data-stores.md deleted file mode 100644 index 41bf6636738..00000000000 --- a/docs/data-guide/technology-choices/olap-data-stores.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -title: Choosing an OLAP data store -description: -author: zoinerTejada -ms:date: 02/12/2018 ---- - -# Choosing an OLAP data store in Azure - -Online analytical processing (OLAP) is a technology that organizes large business databases and supports complex analysis. This topic compares the options for OLAP solutions in Azure. - -> [!NOTE] -> For more information about when to use an OLAP data store, see [Online analytical processing](../scenarios/online-analytical-processing.md). - -## What are your options when choosing an OLAP data store? - -In Azure, all of the following data stores will meet the core requirements for OLAP: - -- [SQL Server with Columnstore indexes](/sql/relational-databases/indexes/get-started-with-columnstore-for-real-time-operational-analytics) -- [Azure Analysis Services](/azure/analysis-services/analysis-services-overview) -- [SQL Server Analysis Services (SSAS)](/sql/analysis-services/analysis-services) - -SQL Server Analysis Services (SSAS) offers OLAP and data mining functionality for business intelligence applications. You can either install SSAS on local servers, or host within a virtual machine in Azure. Azure Analysis Services is a fully managed service that provides the same major features as SSAS. Azure Analysis Services supports connecting to [various data sources](/azure/analysis-services/analysis-services-datasource) in the cloud and on-premises in your organization. - -Clustered Columnstore indexes are available in SQL Server 2014 and above, as well as Azure SQL Database, and are ideal for OLAP workloads. 
However, beginning with SQL Server 2016 (including Azure SQL Database), you can take advantage of hybrid transactional/analytics processing (HTAP) through the use of updateable nonclustered columnstore indexes. HTAP enables you to perform OLTP and OLAP processing on the same platform, which removes the need to store multiple copies of your data, and eliminates the need for distinct OLTP and OLAP systems. For more information, see [Get started with Columnstore for real-time operational analytics](/sql/relational-databases/indexes/get-started-with-columnstore-for-real-time-operational-analytics). - -## Key selection criteria - -To narrow the choices, start by answering these questions: - -- Do you want a managed service rather than managing your own servers? - -- Do you require secure authentication using Azure Active Directory (Azure AD)? - -- Do you want to conduct real-time analytics? If so, narrow your options to those that support real-time analytics. - - *Real-time analytics* in this context applies to a single data source, such as an enterprise resource planning (ERP) application, that will run both an operational and an analytics workload. If you need to integrate data from multiple sources, or require extreme analytics performance by using pre-aggregated data such as cubes, you might still require a separate data warehouse. - -- Do you need to use pre-aggregated data, for example to provide semantic models that make analytics more business user friendly? If yes, choose an option that supports multidimensional cubes or tabular semantic models. - - Providing aggregates can help users consistently calculate data aggregates. Pre-aggregated data can also provide a large performance boost when dealing with several columns across many rows. Data can be pre-aggregated in multidimensional cubes or tabular semantic models. - -- Do you need to integrate data from several sources, beyond your OLTP data store? 
If so, consider options that easily integrate multiple data sources. - -## Capability matrix - -The following tables summarize the key differences in capabilities. - -### General capabilities - -| | Azure Analysis Services | SQL Server Analysis Services | SQL Server with Columnstore Indexes | Azure SQL Database with Columnstore Indexes | -| --- | --- | --- | --- | --- | -| Is managed service | Yes | No | No | Yes | -| Supports multidimensional cubes | No | Yes | No | No | -| Supports tabular semantic models | Yes | Yes | No | No | -| Easily integrate multiple data sources | Yes | Yes | No 1 | No 1 | -| Supports real-time analytics | No | No | Yes | Yes | -| Requires process to copy data from source(s) | Yes | Yes | No | No | -| Azure AD integration | Yes | No | No 2 | Yes | - -[1] Although SQL Server and Azure SQL Database cannot be used to query from and integrate multiple external data sources, you can still build a pipeline that does this for you using [SSIS](/sql/integration-services/sql-server-integration-services) or [Azure Data Factory](/azure/data-factory/). SQL Server hosted in an Azure VM has additional options, such as linked servers and [PolyBase](/sql/relational-databases/polybase/polybase-guide). For more information, see [Pipeline orchestration, control flow, and data movement](../technology-choices/pipeline-orchestration-data-movement.md). - -[2] Connecting to SQL Server running on an Azure Virtual Machine is not supported using an Azure AD account. Use a domain Active Directory account instead. 
- -### Scalability Capabilities - -| | Azure Analysis Services | SQL Server Analysis Services | SQL Server with Columnstore Indexes | Azure SQL Database with Columnstore Indexes | -| --- | --- | --- | --- | --- | -| Redundant regional servers for high availability | Yes | No | Yes | Yes | -| Supports query scale out | Yes | No | Yes | No | -| Dynamic scalability (scale up) | Yes | No | Yes | No | - diff --git a/docs/data-guide/technology-choices/oltp-data-stores.md b/docs/data-guide/technology-choices/oltp-data-stores.md deleted file mode 100644 index 24213c8fea2..00000000000 --- a/docs/data-guide/technology-choices/oltp-data-stores.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -title: Choosing an OLTP data store -description: -author: zoinerTejada -ms:date: 02/12/2018 ---- - -# Choosing an OLTP data store in Azure - -Online transaction processing (OLTP) is the management of transactional data and transaction processing. This topic compares options for OLTP solutions in Azure. - -> [!NOTE] -> For more information about when to use an OLTP data store, see [Online transaction processing](../scenarios/online-analytical-processing.md). - -## What are your options when choosing an OLTP data store? - -In Azure, all of the following data stores will meet the core requirements for OLTP and the management of transaction data: - -- [Azure SQL Database](/azure/sql-database/) -- [SQL Server in an Azure virtual machine](/azure/virtual-machines/windows/sql/virtual-machines-windows-sql-server-iaas-overview?toc=%2Fazure%2Fvirtual-machines%2Fwindows%2Ftoc.json) -- [Azure Database for MySQL](/azure/mysql/) -- [Azure Database for PostgreSQL](/azure/postgresql/) - -## Key selection criteria - -To narrow the choices, start by answering these questions: - -- Do you want a managed service rather than managing your own servers? - -- Does your solution have specific dependencies for Microsoft SQL Server, MySQL or PostgreSQL compatibility? 
Your application may limit the data stores you can choose based on the drivers it supports for communicating with the data store, or the assumptions it makes about which database is used. - -- Are your write throughput requirements particularly high? If yes, choose an option that provides in-memory tables. - -- Is your solution multi-tenant? If so, consider options that support capacity pools, where multiple database instances draw from an elastic pool of resources, instead of fixed resources per database. This can help you better distribute capacity across all database instances, and can make your solution more cost effective. - -- Does your data need to be readable with low latency in multiple regions? If yes, choose an option that supports readable secondary replicas. - -- Does your database need to be highly available across geo-graphic regions? If yes, choose an option that supports geographic replication. Also consider the options that support automatic failover from the primary replica to a secondary replica. - -- Does your database have specific security needs? If yes, examine the options that provide capabilities like row level security, data masking, and transparent data encryption. - -## Capability matrix - -The following tables summarize the key differences in capabilities. - -### General capabilities -| | Azure SQL Database | SQL Server in an Azure virtual machine | Azure Database for MySQL | Azure Database for PostgreSQL | -| --- | --- | --- | --- | --- | --- | -| Is Managed Service | Yes | No | Yes | Yes | -| Runs on Platform | N/A | Windows, Linux, Docker | N/A | N/A | -| Programmability 1 | T-SQL, .NET, R | T-SQL, .NET, R, Python | T-SQL, .NET, R, Python | SQL | SQL | - -[1] Not including client driver support, which allows many programming languages to connect to and use the OLTP data store. 
- -### Scalability capabilities -| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| -| --- | --- | --- | --- | --- | --- | -| Maximum database instance size | [4 TB](/azure/sql-database/sql-database-resource-limits) | 256 TB | [1 TB](/azure/mysql/concepts-limits) | [1 TB](/azure/postgresql/concepts-limits) | -| Supports capacity pools | Yes | Yes | No | No | -| Supports clusters scale out | No | Yes | No | No | -| Dynamic scalability (scale up) | Yes | No | Yes | Yes | - -### Analytic workload capabilities -| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| -| --- | --- | --- | --- | --- | --- | -| Temporal tables | Yes | Yes | No | No | -| In-memory (memory-optimized) tables | Yes | Yes | No | No | -| Columnstore support | Yes | Yes | No | No | -| Adaptive query processing | Yes | Yes | No | No | - -### Availability capabilities -| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| -| --- | --- | --- | --- | --- | --- | -| Readable secondaries | Yes | Yes | No | No | -| Geographic replication | Yes | Yes | No | No | -| Automatic failover to secondary | Yes | No | No | No| -| Point-in-time restore | Yes | Yes | Yes | Yes | - -### Security capabilities -| | Azure SQL Database | SQL Server in an Azure virtual machine| Azure Database for MySQL | Azure Database for PostgreSQL| -| --- | --- | --- | --- | --- | --- | -| Row level security | Yes | Yes | Yes | Yes | -| Data masking | Yes | Yes | No | No | -| Transparent data encryption | Yes | Yes | Yes | Yes | -| Restrict access to specific IP addresses | Yes | Yes | Yes | Yes | -| Restrict access to allow VNET access only | Yes | Yes | No | No | -| Azure Active Directory authentication | Yes | Yes | No | No | -| Active Directory authentication | No | Yes | No | No | -| Multi-factor authentication | Yes | Yes | 
No | No | -| Supports [Always Encrypted](/sql/relational-databases/security/encryption/always-encrypted-database-engine) | Yes | Yes | Yes | No | No | -| Private IP | No | Yes | Yes | No | No | - diff --git a/docs/data-guide/technology-choices/stream-processing.md b/docs/data-guide/technology-choices/stream-processing.md index 42818a9524f..82c4cd4f56d 100644 --- a/docs/data-guide/technology-choices/stream-processing.md +++ b/docs/data-guide/technology-choices/stream-processing.md @@ -10,7 +10,7 @@ ms:date: 02/12/2018 This article compares technology choices for real-time stream processing in Azure. -Real-time stream processing consumes messages from either queue or file-based storage, process the messages, and forward the result to another message queue, file store, or database. Processing may include querying, filtering, and aggregating messages. Stream processing engines must be able to consume an endless streams of data and produce results with minimal latency. For more information, see [Real time processing](../scenarios/real-time-processing.md). +Real-time stream processing consumes messages from either queue or file-based storage, processes the messages, and forwards the result to another message queue, file store, or database. Processing may include querying, filtering, and aggregating messages. Stream processing engines must be able to consume an endless stream of data and produce results with minimal latency. For more information, see [Real time processing](../big-data/real-time-processing.md). ## What are your options when choosing a technology for real-time processing? 
In Azure, all of the following data stores will meet the core requirements supporting real-time processing: @@ -62,4 +62,4 @@ See also: - [Choosing a real-time message ingestion technology](./real-time-ingestion.md) - [Comparing Apache Storm and Azure Stream Analytics](/azure/stream-analytics/stream-analytics-comparison-storm) -- [Real time processing](../scenarios/real-time-processing.md) +- [Real time processing](../big-data/real-time-processing.md) diff --git a/docs/data-guide/toc.yml b/docs/data-guide/toc.yml index 9b3e0d5a109..3887c018d40 100644 --- a/docs/data-guide/toc.yml +++ b/docs/data-guide/toc.yml @@ -3,88 +3,70 @@ - name: Data Architecture Guide href: index.md items: - - name: Traditional RDBMS + - name: Traditional RDBMS workloads items: - - name: Concepts - items: - - name: Relational data - href: concepts/relational-data.md - - name: Transactional data - href: concepts/transactional-data.md - - name: Semantic modeling - href: concepts/semantic-modeling.md - - name: Scenarios - items: - - name: Online transaction processing (OLTP) - href: scenarios/online-transaction-processing.md - - name: Online analytical processing (OLAP) - href: scenarios/online-analytical-processing.md - - name: Data warehousing - href: scenarios/data-warehousing.md - - name: Extract, transform, and load (ETL) - href: scenarios/etl.md - - name: Technology choices - items: - - name: Data warehouses - href: technology-choices/data-warehouses.md - - name: Online analytical processing (OLAP) - href: technology-choices/olap-data-stores.md - - name: Online transaction processing (OLTP) - href: technology-choices/oltp-data-stores.md - - name: Big Data and NoSQL + - name: Overview + href: relational-data/index.md + - name: Online transaction processing (OLTP) + href: relational-data/online-transaction-processing.md + - name: Data warehousing + href: relational-data/data-warehousing.md + - name: Online analytical processing (OLAP) + href: relational-data/online-analytical-processing.md 
+ - name: Extract, transform, and load (ETL) + href: relational-data/etl.md + - name: Big data architectures items: - - name: Concepts - items: - - name: Non-relational data stores - href: concepts/non-relational-data.md - - name: Working with CSV and JSON files - href: concepts/csv-and-json.md - - name: Data lakes - href: concepts/data-lake.md - - name: Big data architectures - href: concepts/big-data.md - - name: Advanced analytics - href: concepts/advanced-analytics.md - - name: Machine learning at scale - href: concepts/machine-learning-at-scale.md - - name: Scenarios - items: + - name: Overview + href: big-data/index.md - name: Batch processing - href: scenarios/batch-processing.md + href: big-data/batch-processing.md - name: Real time processing - href: scenarios/real-time-processing.md - - name: Free-form text search - href: scenarios/search.md - - name: Interactive data exploration - href: scenarios/interactive-data-exploration.md - - name: Natural language processing - href: scenarios/natural-language-processing.md - - name: Time series solutions - href: scenarios/time-series.md - - name: Technology choices - items: - - name: Analytical data stores - href: technology-choices/analytical-data-stores.md - - name: Analytics and reporting - href: technology-choices/analysis-visualizations-reporting.md - - name: Batch processing - href: technology-choices/batch-processing.md - - name: Cognitive services - href: technology-choices/cognitive-services.md - - name: Data storage - href: technology-choices/data-storage.md - - name: Machine learning - href: technology-choices/data-science-and-machine-learning.md - - name: Natural language processing - href: technology-choices/natural-language-processing.md - - name: Pipeline orchestration - href: technology-choices/pipeline-orchestration-data-movement.md - - name: Real-time message ingestion - href: technology-choices/real-time-ingestion.md - - name: Search data stores - href: technology-choices/search-options.md - - 
name: Stream processing - href: technology-choices/stream-processing.md + href: big-data/real-time-processing.md + - name: Machine learning at scale + href: big-data/machine-learning-at-scale.md + - name: Non-relational data stores + href: big-data/non-relational-data.md + - name: Scenarios + items: + - name: Advanced analytics + href: scenarios/advanced-analytics.md + - name: Data lakes + href: scenarios/data-lake.md + - name: Free-form text search + href: scenarios/search.md + - name: Interactive data exploration + href: scenarios/interactive-data-exploration.md + - name: Natural language processing + href: scenarios/natural-language-processing.md + - name: Time series solutions + href: scenarios/time-series.md + - name: Working with CSV and JSON files + href: scenarios/csv-and-json.md + - name: Technology choices + items: + - name: Analytical data stores + href: technology-choices/analytical-data-stores.md + - name: Analytics and reporting + href: technology-choices/analysis-visualizations-reporting.md + - name: Batch processing + href: technology-choices/batch-processing.md + - name: Cognitive services + href: technology-choices/cognitive-services.md + - name: Data storage + href: technology-choices/data-storage.md + - name: Machine learning + href: technology-choices/data-science-and-machine-learning.md + - name: Natural language processing + href: technology-choices/natural-language-processing.md + - name: Pipeline orchestration + href: technology-choices/pipeline-orchestration-data-movement.md + - name: Real-time message ingestion + href: technology-choices/real-time-ingestion.md + - name: Search data stores + href: technology-choices/search-options.md + - name: Stream processing + href: technology-choices/stream-processing.md - name: Cross-cutting concerns items: - name: Data transfer