From 3a2370ec53aa67809b02deb987cf1b084ac4e83d Mon Sep 17 00:00:00 2001 From: Kyle Hounslow Date: Mon, 8 Dec 2025 12:03:25 -0800 Subject: [PATCH 1/3] Migrate PPL Documentation from RST to Markdown Signed-off-by: Kyle Hounslow --- DEVELOPER_GUIDE.rst | 2 +- docs/category.json | 109 +- docs/dev/ppl-commands.md | 2 +- docs/dev/testing-doctest.md | 47 + .../admin/connectors/prometheus_connector.md | 326 ++ .../admin/connectors/prometheus_connector.rst | 279 -- .../ppl/admin/connectors/s3glue_connector.md | 77 + .../ppl/admin/connectors/s3glue_connector.rst | 92 - .../connectors/security_lake_connector.md | 63 + .../connectors/security_lake_connector.rst | 78 - docs/user/ppl/admin/cross_cluster_search.md | 89 + docs/user/ppl/admin/cross_cluster_search.rst | 96 - docs/user/ppl/admin/datasources.md | 304 ++ docs/user/ppl/admin/datasources.rst | 290 -- docs/user/ppl/admin/monitoring.md | 35 + docs/user/ppl/admin/monitoring.rst | 56 - docs/user/ppl/admin/security.md | 65 + docs/user/ppl/admin/security.rst | 70 - docs/user/ppl/admin/settings.md | 441 +++ docs/user/ppl/admin/settings.rst | 427 --- docs/user/ppl/cmd/ad.md | 124 + docs/user/ppl/cmd/ad.rst | 112 - docs/user/ppl/cmd/append.md | 63 + docs/user/ppl/cmd/append.rst | 66 - docs/user/ppl/cmd/appendcol.md | 126 + docs/user/ppl/cmd/appendcol.rst | 110 - docs/user/ppl/cmd/appendpipe.md | 70 + docs/user/ppl/cmd/appendpipe.rst | 68 - docs/user/ppl/cmd/bin.md | 469 +++ docs/user/ppl/cmd/bin.rst | 348 --- docs/user/ppl/cmd/chart.md | 200 ++ docs/user/ppl/cmd/chart.rst | 193 -- docs/user/ppl/cmd/dedup.md | 134 + docs/user/ppl/cmd/dedup.rst | 111 - docs/user/ppl/cmd/describe.md | 67 + docs/user/ppl/cmd/describe.rst | 70 - docs/user/ppl/cmd/eval.md | 132 + docs/user/ppl/cmd/eval.rst | 120 - docs/user/ppl/cmd/eventstats.md | 166 + docs/user/ppl/cmd/eventstats.rst | 162 - docs/user/ppl/cmd/expand.md | 50 + docs/user/ppl/cmd/expand.rst | 61 - docs/user/ppl/cmd/explain.md | 181 ++ docs/user/ppl/cmd/explain.rst | 190 -- docs/user/ppl/cmd/fields.md | 244 ++ docs/user/ppl/cmd/fields.rst | 206 -- docs/user/ppl/cmd/fillnull.md | 176 ++ docs/user/ppl/cmd/fillnull.rst | 156 - docs/user/ppl/cmd/flatten.md | 93 + docs/user/ppl/cmd/flatten.rst | 101 - docs/user/ppl/cmd/grok.md | 86 + docs/user/ppl/cmd/grok.rst | 81 - docs/user/ppl/cmd/head.md | 84 + docs/user/ppl/cmd/head.rst | 77 - docs/user/ppl/cmd/join.md | 214 ++ docs/user/ppl/cmd/join.rst | 198 -- docs/user/ppl/cmd/kmeans.md | 37 + docs/user/ppl/cmd/kmeans.rst | 44 - docs/user/ppl/cmd/lookup.md | 339 ++ docs/user/ppl/cmd/lookup.rst | 350 --- docs/user/ppl/cmd/ml.md | 153 + docs/user/ppl/cmd/ml.rst | 138 - docs/user/ppl/cmd/multisearch.md | 152 + docs/user/ppl/cmd/multisearch.rst | 126 - docs/user/ppl/cmd/parse.md | 133 + docs/user/ppl/cmd/parse.rst | 119 - docs/user/ppl/cmd/patterns.md | 260 ++ docs/user/ppl/cmd/patterns.rst | 225 -- docs/user/ppl/cmd/rare.md | 146 + docs/user/ppl/cmd/rare.rst | 132 - docs/user/ppl/cmd/regex.md | 155 + docs/user/ppl/cmd/regex.rst | 140 - docs/user/ppl/cmd/rename.md | 142 + docs/user/ppl/cmd/rename.rst | 130 - docs/user/ppl/cmd/replace.md | 330 ++ docs/user/ppl/cmd/replace.rst | 268 -- docs/user/ppl/cmd/reverse.md | 134 + docs/user/ppl/cmd/reverse.rst | 115 - docs/user/ppl/cmd/rex.md | 291 ++ docs/user/ppl/cmd/rex.rst | 235 -- docs/user/ppl/cmd/search.md | 745 +++++ docs/user/ppl/cmd/search.rst | 556 ---- docs/user/ppl/cmd/showdatasources.md | 32 + docs/user/ppl/cmd/showdatasources.rst | 38 - docs/user/ppl/cmd/sort.md | 256 ++ docs/user/ppl/cmd/sort.rst | 208 -- 
docs/user/ppl/cmd/spath.md | 110 + docs/user/ppl/cmd/spath.rst | 92 - docs/user/ppl/cmd/stats.md | 487 +++ docs/user/ppl/cmd/stats.rst | 409 --- docs/user/ppl/cmd/streamstats.md | 281 ++ docs/user/ppl/cmd/streamstats.rst | 273 -- docs/user/ppl/cmd/subquery.md | 197 ++ docs/user/ppl/cmd/subquery.rst | 206 -- docs/user/ppl/cmd/syntax.md | 18 + docs/user/ppl/cmd/syntax.rst | 30 - docs/user/ppl/cmd/table.md | 37 + docs/user/ppl/cmd/table.rst | 44 - docs/user/ppl/cmd/timechart.md | 375 +++ docs/user/ppl/cmd/timechart.rst | 351 --- docs/user/ppl/cmd/top.md | 164 + docs/user/ppl/cmd/top.rst | 145 - docs/user/ppl/cmd/trendline.md | 114 + docs/user/ppl/cmd/trendline.rst | 103 - docs/user/ppl/cmd/where.md | 207 ++ docs/user/ppl/cmd/where.rst | 165 - docs/user/ppl/functions/aggregations.md | 653 ++++ docs/user/ppl/functions/aggregations.rst | 522 ---- docs/user/ppl/functions/collection.md | 727 +++++ docs/user/ppl/functions/collection.rst | 450 --- docs/user/ppl/functions/condition.md | 803 +++++ docs/user/ppl/functions/condition.rst | 615 ---- docs/user/ppl/functions/conversion.md | 272 ++ docs/user/ppl/functions/conversion.rst | 203 -- docs/user/ppl/functions/cryptographic.md | 101 + docs/user/ppl/functions/cryptographic.rst | 90 - docs/user/ppl/functions/datetime.md | 2782 +++++++++++++++++ docs/user/ppl/functions/datetime.rst | 2360 -------------- docs/user/ppl/functions/expressions.md | 185 ++ docs/user/ppl/functions/expressions.rst | 177 -- docs/user/ppl/functions/ip.md | 61 + docs/user/ppl/functions/ip.rst | 69 - docs/user/ppl/functions/json.md | 502 +++ docs/user/ppl/functions/json.rst | 363 --- docs/user/ppl/functions/math.md | 1187 +++++++ docs/user/ppl/functions/math.rst | 1045 ------- docs/user/ppl/functions/relevance.md | 505 +++ docs/user/ppl/functions/relevance.rst | 424 --- docs/user/ppl/functions/statistical.md | 142 + docs/user/ppl/functions/statistical.rst | 109 - docs/user/ppl/functions/string.md | 549 ++++ docs/user/ppl/functions/string.rst | 479 --- docs/user/ppl/functions/system.md | 29 + docs/user/ppl/functions/system.rst | 31 - docs/user/ppl/general/comments.md | 49 + docs/user/ppl/general/comments.rst | 44 - docs/user/ppl/general/datatypes.md | 327 ++ docs/user/ppl/general/datatypes.rst | 392 --- docs/user/ppl/general/identifiers.md | 188 ++ docs/user/ppl/general/identifiers.rst | 188 -- docs/user/ppl/index.md | 100 + docs/user/ppl/index.rst | 137 - docs/user/ppl/interfaces/endpoint.md | 154 + docs/user/ppl/interfaces/endpoint.rst | 150 - docs/user/ppl/interfaces/protocol.md | 130 + docs/user/ppl/interfaces/protocol.rst | 137 - docs/user/ppl/limitations/limitations.md | 89 + docs/user/ppl/limitations/limitations.rst | 132 - .../reference/splunk_to_ppl_cheat_sheet.md | 193 +- doctest/markdown_parser.py | 286 ++ doctest/test_docs.py | 227 +- scripts/docs_exporter/convert_rst_to_md.py | 536 ++++ .../docs_exporter/export_to_docs_website.py | 104 + .../docs_exporter/fix_markdown_formatting.py | 161 + 154 files changed, 20377 insertions(+), 16776 deletions(-) create mode 100644 docs/user/ppl/admin/connectors/prometheus_connector.md delete mode 100644 docs/user/ppl/admin/connectors/prometheus_connector.rst create mode 100644 docs/user/ppl/admin/connectors/s3glue_connector.md delete mode 100644 docs/user/ppl/admin/connectors/s3glue_connector.rst create mode 100644 docs/user/ppl/admin/connectors/security_lake_connector.md delete mode 100644 docs/user/ppl/admin/connectors/security_lake_connector.rst create mode 100644 docs/user/ppl/admin/cross_cluster_search.md delete mode 100644 
docs/user/ppl/admin/cross_cluster_search.rst create mode 100644 docs/user/ppl/admin/datasources.md delete mode 100644 docs/user/ppl/admin/datasources.rst create mode 100644 docs/user/ppl/admin/monitoring.md delete mode 100644 docs/user/ppl/admin/monitoring.rst create mode 100644 docs/user/ppl/admin/security.md delete mode 100644 docs/user/ppl/admin/security.rst create mode 100644 docs/user/ppl/admin/settings.md delete mode 100644 docs/user/ppl/admin/settings.rst create mode 100644 docs/user/ppl/cmd/ad.md delete mode 100644 docs/user/ppl/cmd/ad.rst create mode 100644 docs/user/ppl/cmd/append.md delete mode 100644 docs/user/ppl/cmd/append.rst create mode 100644 docs/user/ppl/cmd/appendcol.md delete mode 100644 docs/user/ppl/cmd/appendcol.rst create mode 100644 docs/user/ppl/cmd/appendpipe.md delete mode 100644 docs/user/ppl/cmd/appendpipe.rst create mode 100644 docs/user/ppl/cmd/bin.md delete mode 100644 docs/user/ppl/cmd/bin.rst create mode 100644 docs/user/ppl/cmd/chart.md delete mode 100644 docs/user/ppl/cmd/chart.rst create mode 100644 docs/user/ppl/cmd/dedup.md delete mode 100644 docs/user/ppl/cmd/dedup.rst create mode 100644 docs/user/ppl/cmd/describe.md delete mode 100644 docs/user/ppl/cmd/describe.rst create mode 100644 docs/user/ppl/cmd/eval.md delete mode 100644 docs/user/ppl/cmd/eval.rst create mode 100644 docs/user/ppl/cmd/eventstats.md delete mode 100644 docs/user/ppl/cmd/eventstats.rst create mode 100644 docs/user/ppl/cmd/expand.md delete mode 100644 docs/user/ppl/cmd/expand.rst create mode 100644 docs/user/ppl/cmd/explain.md delete mode 100644 docs/user/ppl/cmd/explain.rst create mode 100644 docs/user/ppl/cmd/fields.md delete mode 100644 docs/user/ppl/cmd/fields.rst create mode 100644 docs/user/ppl/cmd/fillnull.md delete mode 100644 docs/user/ppl/cmd/fillnull.rst create mode 100644 docs/user/ppl/cmd/flatten.md delete mode 100644 docs/user/ppl/cmd/flatten.rst create mode 100644 docs/user/ppl/cmd/grok.md delete mode 100644 docs/user/ppl/cmd/grok.rst create mode 100644 docs/user/ppl/cmd/head.md delete mode 100644 docs/user/ppl/cmd/head.rst create mode 100644 docs/user/ppl/cmd/join.md delete mode 100644 docs/user/ppl/cmd/join.rst create mode 100644 docs/user/ppl/cmd/kmeans.md delete mode 100644 docs/user/ppl/cmd/kmeans.rst create mode 100644 docs/user/ppl/cmd/lookup.md delete mode 100644 docs/user/ppl/cmd/lookup.rst create mode 100644 docs/user/ppl/cmd/ml.md delete mode 100644 docs/user/ppl/cmd/ml.rst create mode 100644 docs/user/ppl/cmd/multisearch.md delete mode 100644 docs/user/ppl/cmd/multisearch.rst create mode 100644 docs/user/ppl/cmd/parse.md delete mode 100644 docs/user/ppl/cmd/parse.rst create mode 100644 docs/user/ppl/cmd/patterns.md delete mode 100644 docs/user/ppl/cmd/patterns.rst create mode 100644 docs/user/ppl/cmd/rare.md delete mode 100644 docs/user/ppl/cmd/rare.rst create mode 100644 docs/user/ppl/cmd/regex.md delete mode 100644 docs/user/ppl/cmd/regex.rst create mode 100644 docs/user/ppl/cmd/rename.md delete mode 100644 docs/user/ppl/cmd/rename.rst create mode 100644 docs/user/ppl/cmd/replace.md delete mode 100644 docs/user/ppl/cmd/replace.rst create mode 100644 docs/user/ppl/cmd/reverse.md delete mode 100644 docs/user/ppl/cmd/reverse.rst create mode 100644 docs/user/ppl/cmd/rex.md delete mode 100644 docs/user/ppl/cmd/rex.rst create mode 100644 docs/user/ppl/cmd/search.md delete mode 100644 docs/user/ppl/cmd/search.rst create mode 100644 docs/user/ppl/cmd/showdatasources.md delete mode 100644 docs/user/ppl/cmd/showdatasources.rst create mode 100644 
docs/user/ppl/cmd/sort.md delete mode 100644 docs/user/ppl/cmd/sort.rst create mode 100644 docs/user/ppl/cmd/spath.md delete mode 100644 docs/user/ppl/cmd/spath.rst create mode 100644 docs/user/ppl/cmd/stats.md delete mode 100644 docs/user/ppl/cmd/stats.rst create mode 100644 docs/user/ppl/cmd/streamstats.md delete mode 100644 docs/user/ppl/cmd/streamstats.rst create mode 100644 docs/user/ppl/cmd/subquery.md delete mode 100644 docs/user/ppl/cmd/subquery.rst create mode 100644 docs/user/ppl/cmd/syntax.md delete mode 100644 docs/user/ppl/cmd/syntax.rst create mode 100644 docs/user/ppl/cmd/table.md delete mode 100644 docs/user/ppl/cmd/table.rst create mode 100644 docs/user/ppl/cmd/timechart.md delete mode 100644 docs/user/ppl/cmd/timechart.rst create mode 100644 docs/user/ppl/cmd/top.md delete mode 100644 docs/user/ppl/cmd/top.rst create mode 100644 docs/user/ppl/cmd/trendline.md delete mode 100644 docs/user/ppl/cmd/trendline.rst create mode 100644 docs/user/ppl/cmd/where.md delete mode 100644 docs/user/ppl/cmd/where.rst create mode 100644 docs/user/ppl/functions/aggregations.md delete mode 100644 docs/user/ppl/functions/aggregations.rst create mode 100644 docs/user/ppl/functions/collection.md delete mode 100644 docs/user/ppl/functions/collection.rst create mode 100644 docs/user/ppl/functions/condition.md delete mode 100644 docs/user/ppl/functions/condition.rst create mode 100644 docs/user/ppl/functions/conversion.md delete mode 100644 docs/user/ppl/functions/conversion.rst create mode 100644 docs/user/ppl/functions/cryptographic.md delete mode 100644 docs/user/ppl/functions/cryptographic.rst create mode 100644 docs/user/ppl/functions/datetime.md delete mode 100644 docs/user/ppl/functions/datetime.rst create mode 100644 docs/user/ppl/functions/expressions.md delete mode 100644 docs/user/ppl/functions/expressions.rst create mode 100644 docs/user/ppl/functions/ip.md delete mode 100644 docs/user/ppl/functions/ip.rst create mode 100644 docs/user/ppl/functions/json.md delete mode 100644 docs/user/ppl/functions/json.rst create mode 100644 docs/user/ppl/functions/math.md delete mode 100644 docs/user/ppl/functions/math.rst create mode 100644 docs/user/ppl/functions/relevance.md delete mode 100644 docs/user/ppl/functions/relevance.rst create mode 100644 docs/user/ppl/functions/statistical.md delete mode 100644 docs/user/ppl/functions/statistical.rst create mode 100644 docs/user/ppl/functions/string.md delete mode 100644 docs/user/ppl/functions/string.rst create mode 100644 docs/user/ppl/functions/system.md delete mode 100644 docs/user/ppl/functions/system.rst create mode 100644 docs/user/ppl/general/comments.md delete mode 100644 docs/user/ppl/general/comments.rst create mode 100644 docs/user/ppl/general/datatypes.md delete mode 100644 docs/user/ppl/general/datatypes.rst create mode 100644 docs/user/ppl/general/identifiers.md delete mode 100644 docs/user/ppl/general/identifiers.rst create mode 100644 docs/user/ppl/index.md delete mode 100644 docs/user/ppl/index.rst create mode 100644 docs/user/ppl/interfaces/endpoint.md delete mode 100644 docs/user/ppl/interfaces/endpoint.rst create mode 100644 docs/user/ppl/interfaces/protocol.md delete mode 100644 docs/user/ppl/interfaces/protocol.rst create mode 100644 docs/user/ppl/limitations/limitations.md delete mode 100644 docs/user/ppl/limitations/limitations.rst create mode 100644 doctest/markdown_parser.py create mode 100644 scripts/docs_exporter/convert_rst_to_md.py create mode 100755 scripts/docs_exporter/export_to_docs_website.py create mode 100755 
scripts/docs_exporter/fix_markdown_formatting.py diff --git a/DEVELOPER_GUIDE.rst b/DEVELOPER_GUIDE.rst index 92304c51606..a179b1fc64d 100644 --- a/DEVELOPER_GUIDE.rst +++ b/DEVELOPER_GUIDE.rst @@ -172,7 +172,7 @@ Here are other files and sub-folders that you are likely to touch: - ``build.gradle``: Gradle build script. - ``docs``: documentation for developers and reference manual for users. -- ``doc-test``: code that run .rst docs in ``docs`` folder by Python doctest library. +- ``doctest``: code that runs .rst and .md docs in ``docs`` folder by Python doctest library. Note that other related project code has already merged into this single repository together: diff --git a/docs/category.json b/docs/category.json index f3fe70ecfa5..bf1f9b1d22d 100644 --- a/docs/category.json +++ b/docs/category.json @@ -4,8 +4,61 @@ "user/admin/settings.rst" ], "bash_calcite": [ - "user/ppl/interfaces/endpoint.rst", - "user/ppl/interfaces/protocol.rst" + "user/ppl/interfaces/endpoint.md", + "user/ppl/interfaces/protocol.md" + ], + "ppl_cli_calcite": [ + "user/ppl/cmd/ad.md", + "user/ppl/cmd/append.md", + "user/ppl/cmd/bin.md", + "user/ppl/cmd/dedup.md", + "user/ppl/cmd/describe.md", + "user/ppl/cmd/eventstats.md", + "user/ppl/cmd/eval.md", + "user/ppl/cmd/fields.md", + "user/ppl/cmd/fillnull.md", + "user/ppl/cmd/grok.md", + "user/ppl/cmd/head.md", + "user/ppl/cmd/join.md", + "user/ppl/cmd/lookup.md", + "user/ppl/cmd/parse.md", + "user/ppl/cmd/patterns.md", + "user/ppl/cmd/rare.md", + "user/ppl/cmd/regex.md", + "user/ppl/cmd/rename.md", + "user/ppl/cmd/multisearch.md", + "user/ppl/cmd/replace.md", + "user/ppl/cmd/rex.md", + "user/ppl/cmd/search.md", + "user/ppl/cmd/showdatasources.md", + "user/ppl/cmd/sort.md", + "user/ppl/cmd/spath.md", + "user/ppl/cmd/stats.md", + "user/ppl/cmd/streamstats.md", + "user/ppl/cmd/subquery.md", + "user/ppl/cmd/syntax.md", + "user/ppl/cmd/chart.md", + "user/ppl/cmd/timechart.md", + "user/ppl/cmd/top.md", + "user/ppl/cmd/trendline.md", + "user/ppl/cmd/where.md", + "user/ppl/functions/aggregations.md", + "user/ppl/functions/collection.md", + "user/ppl/functions/condition.md", + "user/ppl/functions/conversion.md", + "user/ppl/functions/cryptographic.md", + "user/ppl/functions/datetime.md", + "user/ppl/functions/expressions.md", + "user/ppl/functions/ip.md", + "user/ppl/functions/json.md", + "user/ppl/functions/math.md", + "user/ppl/functions/relevance.md", + "user/ppl/functions/statistical.md", + "user/ppl/functions/string.md", + "user/ppl/functions/system.md", + "user/ppl/general/comments.md", + "user/ppl/general/datatypes.md", + "user/ppl/general/identifiers.md" ], "sql_cli": [ "user/dql/expressions.rst", @@ -21,57 +74,7 @@ "user/dql/complex.rst", "user/dql/metadata.rst" ], - "ppl_cli_calcite": [ - "user/ppl/cmd/ad.rst", - "user/ppl/cmd/append.rst", - "user/ppl/cmd/bin.rst", - "user/ppl/cmd/dedup.rst", - "user/ppl/cmd/describe.rst", - "user/ppl/cmd/eventstats.rst", - "user/ppl/cmd/eval.rst", - "user/ppl/cmd/fields.rst", - "user/ppl/cmd/fillnull.rst", - "user/ppl/cmd/grok.rst", - "user/ppl/cmd/head.rst", - "user/ppl/cmd/join.rst", - "user/ppl/cmd/lookup.rst", - "user/ppl/cmd/parse.rst", - "user/ppl/cmd/patterns.rst", - "user/ppl/cmd/rare.rst", - "user/ppl/cmd/regex.rst", - "user/ppl/cmd/rename.rst", - "user/ppl/cmd/multisearch.rst", - "user/ppl/cmd/replace.rst", - "user/ppl/cmd/rex.rst", - "user/ppl/cmd/search.rst", - "user/ppl/cmd/showdatasources.rst", - "user/ppl/cmd/sort.rst", - "user/ppl/cmd/spath.rst", - "user/ppl/cmd/stats.rst", - "user/ppl/cmd/streamstats.rst", - 
"user/ppl/cmd/subquery.rst", - "user/ppl/cmd/syntax.rst", - "user/ppl/cmd/chart.rst", - "user/ppl/cmd/timechart.rst", - "user/ppl/cmd/search.rst", - "user/ppl/functions/statistical.rst", - "user/ppl/cmd/top.rst", - "user/ppl/cmd/trendline.rst", - "user/ppl/cmd/where.rst", - "user/ppl/functions/collection.rst", - "user/ppl/functions/condition.rst", - "user/ppl/functions/datetime.rst", - "user/ppl/functions/expressions.rst", - "user/ppl/functions/ip.rst", - "user/ppl/functions/json.rst", - "user/ppl/functions/math.rst", - "user/ppl/functions/relevance.rst", - "user/ppl/functions/string.rst", - "user/ppl/functions/conversion.rst", - "user/ppl/general/datatypes.rst", - "user/ppl/general/identifiers.rst" - ], "bash_settings": [ - "user/ppl/admin/settings.rst" + "user/ppl/admin/settings.md" ] } diff --git a/docs/dev/ppl-commands.md b/docs/dev/ppl-commands.md index 9d62e607f86..ea727e234a5 100644 --- a/docs/dev/ppl-commands.md +++ b/docs/dev/ppl-commands.md @@ -54,4 +54,4 @@ If you are working on contributing a new PPL command, please read this guide and - Add a test in `CrossClusterSearchIT` - [ ] **User doc:** - - Add a xxx.rst under `docs/user/ppl/cmd` and link the new doc to `docs/user/ppl/index.rst` + - Add a xxx.md under `docs/user/ppl/cmd` and link the new doc to `docs/user/ppl/index.md` diff --git a/docs/dev/testing-doctest.md b/docs/dev/testing-doctest.md index 1a966ba50c3..55d73d9f02a 100644 --- a/docs/dev/testing-doctest.md +++ b/docs/dev/testing-doctest.md @@ -57,11 +57,58 @@ Doctest runs with project build by `./gradlew build`. You can also only run doct Make sure you don't have any OpenSearch instance running at `http://localhost:9200` ### 1.4.2 How to write documentation with doctest? + +#### RST Format (SQL docs only. On Deprecation path. Use markdown for PPL) 1. If you want to add a new doc, you can add it to `docs` folder, under correct sub-folder, in `.rst` format. > **Attention**: For code examples in documentation, a Mixing usage of `cli` and `bash` in one doc is not supported yet. 2. Add your new doc file path to `docs/category.json` by its category 3. Run doctest `./gradlew doctest` (optionally with `-DignorePrometheus`) to see if your tests can pass +#### Markdown Format (New - Currently for docs/user/ppl only) +For PPL documentation, Markdown format is now supported with the following guidelines: + +1. **File Format**: Create `.md` file(s) in `docs/user/ppl` folder +2. **Category Configuration**: Add markdown files to markdown-only categories in `docs/category.json`: + - `ppl_cli_calcite`: PPL CLI examples with Calcite engine + - `bash_calcite`: Bash/curl examples with Calcite engine + - `bash_settings`: Bash examples for settings/configuration + +3. **Code Block Format**: Use **paired** fenced code blocks - each input block must be followed by its expected output block: + +```ppl +search source=accounts | where age > 25 | fields firstname, lastname +``` + +Expected output: + +```text ++-------------+------------+ +| firstname | lastname | +|-------------+------------| +| Amber | Duke | +| Hattie | Bond | ++-------------+------------+ +``` + +**Input/Output Pairs**: Each input code fence must be immediately followed by an "Expected output:" section with an output code fence +- **Supported Input Languages**: `sql`, `ppl`, `bash`, `sh`, `bash ppl` +- **Supported Output Languages**: `text`, `console`, `output`, `json`, `yaml` + +4. 
**Ignoring Tests**: To skip specific code blocks from testing, add the `ignore` attribute:

```ppl ignore
search source=accounts | head 5
```

Expected output:

```text
This output won't be tested
```

5. **Validation**: Markdown categories only accept `.md` files - mixing with `.rst` files will cause validation errors
6. **Testing**: Run `./gradlew doctest` to validate your markdown documentation

Currently, there is a `sample` folder under the `docs` module to help you get started.

## 1.5 Future Plan diff --git a/docs/user/ppl/admin/connectors/prometheus_connector.md b/docs/user/ppl/admin/connectors/prometheus_connector.md new file mode 100644 index 00000000000..fab00fb21bb --- /dev/null +++ b/docs/user/ppl/admin/connectors/prometheus_connector.md @@ -0,0 +1,326 @@ +# Prometheus Connector + +## Introduction + +This page covers prometheus connector properties for dataSource configuration +and the nuances associated with the prometheus connector. +## Prometheus Connector Properties in DataSource Configuration + +Prometheus Connector Properties. +* `prometheus.uri` [Required]. + * This parameter provides the URI information to connect to a prometheus instance. +* `prometheus.auth.type` [Optional] + * This parameter provides the authentication type information. + * Prometheus connector currently supports `basicauth` and `awssigv4` authentication mechanisms. + * If prometheus.auth.type is basicauth, the following parameters are required: + * `prometheus.auth.username` and `prometheus.auth.password`. + * If prometheus.auth.type is awssigv4, the following parameters are required: + * `prometheus.auth.region`, `prometheus.auth.access_key` and `prometheus.auth.secret_key` 

## Example prometheus dataSource configuration with different authentications 

No Auth 

```bash
[{
    "name" : "my_prometheus",
    "connector": "prometheus",
    "properties" : {
        "prometheus.uri" : "http://localhost:9090"
    }
}]

```

Basic Auth 

```bash
[{
    "name" : "my_prometheus",
    "connector": "prometheus",
    "properties" : {
        "prometheus.uri" : "http://localhost:9090",
        "prometheus.auth.type" : "basicauth",
        "prometheus.auth.username" : "admin",
        "prometheus.auth.password" : "admin"
    }
}]

```

AWSSigV4 Auth 

```bash
[{
    "name" : "my_prometheus",
    "connector": "prometheus",
    "properties" : {
        "prometheus.uri" : "http://localhost:8080",
        "prometheus.auth.type" : "awssigv4",
        "prometheus.auth.region" : "us-east-1",
        "prometheus.auth.access_key" : "{{accessKey}}",
        "prometheus.auth.secret_key" : "{{secretKey}}"
    }
}]

```

## PPL Query support for prometheus connector 

### Metric as a Table 

Each connector has to abstract the underlying datasource constructs into a table as part of the interface contract with the PPL query engine. +Prometheus connector abstracts each metric as a table, and the columns of this table are `@value`, `@timestamp`, `label1`, `label2`, and so on. +`@value` represents the metric measurement and `@timestamp` represents the timestamp at which the metric was collected. Labels are tags associated with the queried metric. +For example, `handler`, `code`, `instance`, and `job` are the labels associated with the `prometheus_http_requests_total` metric. With this abstraction, we can query prometheus +data using PPL syntax similar to opensearch indices.
+Sample Example

```ppl
source = my_prometheus.prometheus_http_requests_total
```

Expected output:

```text
+--------+-----------------------+--------------+------+------------+------------+
| @value | @timestamp            | handler      | code | instance   | job        |
|--------+-----------------------+--------------+------+------------+------------|
| 5      | "2022-11-03 07:18:14" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 3      | "2022-11-03 07:18:24" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 7      | "2022-11-03 07:18:34" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 2      | "2022-11-03 07:18:44" | "/-/ready"   | 400  | 192.15.2.1 | prometheus |
| 9      | "2022-11-03 07:18:54" | "/-/promql"  | 400  | 192.15.2.1 | prometheus |
| 11     | "2022-11-03 07:18:64" | "/-/metrics" | 500  | 192.15.2.1 | prometheus |
+--------+-----------------------+--------------+------+------------+------------+
```

### Default time range and resolution

Time range and resolution are required parameters for the Prometheus query APIs. These parameters are determined from the PPL commands in the following manner:
* Time range is determined through a filter clause on `@timestamp`. If there is no such filter clause, the time range will be set to 1h with the end time set to now().
* In case of stats, resolution is determined by the `span(@timestamp,15s)` expression. For normal select queries, resolution is determined automatically from the time range set.

### Prometheus Connector Limitations

* Only one aggregation is supported in the stats command.
* A span expression is compulsory in the stats command.
* AVG, MAX, MIN, SUM, COUNT are the only aggregations supported in the prometheus connector.
* The where clause only supports the equals (=) operation on metric dimensions and comparative (>, <, >=, <=) operations on the @timestamp attribute.

### Example queries

1. Metric Selection Query

```ppl
source = my_prometheus.prometheus_http_requests_total
```

Expected output:

```text
+--------+-----------------------+--------------+------+------------+------------+
| @value | @timestamp            | handler      | code | instance   | job        |
|--------+-----------------------+--------------+------+------------+------------|
| 5      | "2022-11-03 07:18:14" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 3      | "2022-11-03 07:18:24" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 7      | "2022-11-03 07:18:34" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 2      | "2022-11-03 07:18:44" | "/-/ready"   | 400  | 192.15.2.1 | prometheus |
| 9      | "2022-11-03 07:18:54" | "/-/promql"  | 400  | 192.15.2.1 | prometheus |
| 11     | "2022-11-03 07:18:64" | "/-/metrics" | 500  | 192.15.2.1 | prometheus |
+--------+-----------------------+--------------+------+------------+------------+
```

2. 
Metric Selecting Query with specific dimensions + +```ppl +source = my_prometheus.prometheus_http_requests_total +| where handler='/-/ready' and code='200' +``` + +Expected output: + +```text ++--------+-----------------------+------------+------+------------+------------+ +| @value | @timestamp | handler | code | instance | job | +|--------+-----------------------+------------+------+------------+------------| +| 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | 192.15.1.1 | prometheus | +| 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | 192.15.1.1 | prometheus | +| 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | 192.15.1.1 | prometheus | +| 2 | "2022-11-03 07:18:44" | "/-/ready" | 200 | 192.15.2.1 | prometheus | +| 9 | "2022-11-03 07:18:54" | "/-/ready" | 200 | 192.15.2.1 | prometheus | +| 11 | "2022-11-03 07:18:64" | "/-/ready" | 200 | 192.15.2.1 | prometheus | ++--------+-----------------------+------------+------+------------+------------+ +``` + +3. Average aggregation on a metric + +```ppl +source = my_prometheus.prometheus_http_requests_total +| stats avg(@value) by span(@timestamp,15s) +``` + +Expected output: + +```text ++------------+------------------------+ +| avg(@value)| span(@timestamp,15s) | +|------------+------------------------+ +| 5 | "2022-11-03 07:18:14" | +| 3 | "2022-11-03 07:18:24" | +| 7 | "2022-11-03 07:18:34" | +| 2 | "2022-11-03 07:18:44" | +| 9 | "2022-11-03 07:18:54" | +| 11 | "2022-11-03 07:18:64" | ++------------+------------------------+ +``` + +4. Average aggregation grouped by dimensions + +```ppl +source = my_prometheus.prometheus_http_requests_total +| stats avg(@value) by span(@timestamp,15s), handler, code +``` + +Expected output: + +```text ++------------+------------------------+--------------------------------+---------------+ +| avg(@value)| span(@timestamp,15s) | handler | code | +|------------+------------------------+--------------------------------+---------------+ +| 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | +| 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | +| 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | +| 2 | "2022-11-03 07:18:44" | "/-/ready" | 400 | +| 9 | "2022-11-03 07:18:54" | "/-/promql" | 400 | +| 11 | "2022-11-03 07:18:64" | "/-/metrics" | 500 | ++------------+------------------------+--------------------------------+---------------+ +``` + +5. Count aggregation query + +```ppl +source = my_prometheus.prometheus_http_requests_total +| stats count() by span(@timestamp,15s), handler, code +``` + +Expected output: + +```text ++------------+------------------------+--------------------------------+---------------+ +| count() | span(@timestamp,15s) | handler | code | +|------------+------------------------+--------------------------------+---------------+ +| 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | +| 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | +| 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | +| 2 | "2022-11-03 07:18:44" | "/-/ready" | 400 | +| 9 | "2022-11-03 07:18:54" | "/-/promql" | 400 | +| 11 | "2022-11-03 07:18:64" | "/-/metrics" | 500 | ++------------+------------------------+--------------------------------+---------------+ +``` + +## PromQL Support for prometheus Connector + +### `query_range` Table Function + +* Prometheus connector offers `query_range` table function. This table function can be used to query metrics in a specific time range using promQL. 
+* The function takes inputs similar to the parameters of the query range API described here: [Prometheus query_range API](https://prometheus.io/docs/prometheus/latest/querying/api/)
+* Arguments can be passed either by name or by position.
  - `source=my_prometheus.query_range('prometheus_http_requests_total', 1686694425, 1686700130, 14)`
  - `source=my_prometheus.query_range(query='prometheus_http_requests_total', starttime=1686694425, endtime=1686700130, step=14)`

Example

```ppl
source=my_prometheus.query_range('prometheus_http_requests_total', 1686694425, 1686700130, 14)
```

Expected output:

```text
+--------+-----------------------+--------------+------+------------+------------+
| @value | @timestamp            | handler      | code | instance   | job        |
|--------+-----------------------+--------------+------+------------+------------|
| 5      | "2022-11-03 07:18:14" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 3      | "2022-11-03 07:18:24" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 7      | "2022-11-03 07:18:34" | "/-/ready"   | 200  | 192.15.1.1 | prometheus |
| 2      | "2022-11-03 07:18:44" | "/-/ready"   | 400  | 192.15.2.1 | prometheus |
| 9      | "2022-11-03 07:18:54" | "/-/promql"  | 400  | 192.15.2.1 | prometheus |
| 11     | "2022-11-03 07:18:64" | "/-/metrics" | 500  | 192.15.2.1 | prometheus |
+--------+-----------------------+--------------+------+------------+------------+
```

## Prometheus Connector Table Functions

### `query_exemplars` Table Function

* This table function can be used to fetch exemplars of a query in a specific time range.
* The function takes inputs similar to the parameters of the query exemplars API described here: [Prometheus query_exemplars API](https://prometheus.io/docs/prometheus/latest/querying/api/)
* Arguments can be passed either by name or by position.
  - `source=my_prometheus.query_exemplars('prometheus_http_requests_total', 1686694425, 1686700130)`
  - `source=my_prometheus.query_exemplars(query='prometheus_http_requests_total', starttime=1686694425, endtime=1686700130)`

Example

```ppl
source=my_prometheus.query_exemplars('prometheus_http_requests_total', 1686694425, 1686700130)
```

Expected output:

```text
 "schema": [
    {
      "name": "seriesLabels",
      "type": "struct"
    },
    {
      "name": "exemplars",
      "type": "array"
    }
  ],
  "datarows": [
    [
      {
        "instance": "localhost:8090",
        "__name__": "test_exemplar_metric_total",
        "service": "bar",
        "job": "prometheus"
      },
      [
        {
          "labels": {
            "traceID": "EpTxMJ40fUus7aGY"
          },
          "timestamp": "2020-09-14 15:22:25.479",
          "value": 6.0
        }
      ]
    ],
    [
      {
        "instance": "localhost:8090",
        "__name__": "test_exemplar_metric_total",
        "service": "foo",
        "job": "prometheus"
      },
      [
        {
          "labels": {
            "traceID": "Olp9XHlq763ccsfa"
          },
          "timestamp": "2020-09-14 15:22:35.479",
          "value": 19.0
        },
        {
          "labels": {
            "traceID": "hCtjygkIHwAN9vs4"
          },
          "timestamp": "2020-09-14 15:22:45.489",
          "value": 20.0
        }
      ]
    ]
  ]
```
 \ No newline at end of file diff --git a/docs/user/ppl/admin/connectors/prometheus_connector.rst b/docs/user/ppl/admin/connectors/prometheus_connector.rst deleted file mode 100644 index 812df4f8943..00000000000 --- a/docs/user/ppl/admin/connectors/prometheus_connector.rst +++ /dev/null @@ -1,279 +0,0 @@ -.. highlight:: sh - -==================== -Prometheus Connector -==================== - -.. rubric:: Table of contents - -..
contents:: - :local: - :depth: 1 - - -Introduction -============ - -This page covers prometheus connector properties for dataSource configuration -and the nuances associated with prometheus connector. - - -Prometheus Connector Properties in DataSource Configuration -======================================================== -Prometheus Connector Properties. - -* ``prometheus.uri`` [Required]. - * This parameters provides the URI information to connect to a prometheus instance. -* ``prometheus.auth.type`` [Optional] - * This parameters provides the authentication type information. - * Prometheus connector currently supports ``basicauth`` and ``awssigv4`` authentication mechanisms. - * If prometheus.auth.type is basicauth, following are required parameters. - * ``prometheus.auth.username`` and ``prometheus.auth.password``. - * If prometheus.auth.type is awssigv4, following are required parameters. - * ``prometheus.auth.region``, ``prometheus.auth.access_key`` and ``prometheus.auth.secret_key`` - -Example prometheus dataSource configuration with different authentications -======================================================================= - -No Auth :: - - [{ - "name" : "my_prometheus", - "connector": "prometheus", - "properties" : { - "prometheus.uri" : "http://localhost:9090" - } - }] - -Basic Auth :: - - [{ - "name" : "my_prometheus", - "connector": "prometheus", - "properties" : { - "prometheus.uri" : "http://localhost:9090", - "prometheus.auth.type" : "basicauth", - "prometheus.auth.username" : "admin", - "prometheus.auth.password" : "admin" - } - }] - -AWSSigV4 Auth:: - - [{ - "name" : "my_prometheus", - "connector": "prometheus", - "properties" : { - "prometheus.uri" : "http://localhost:8080", - "prometheus.auth.type" : "awssigv4", - "prometheus.auth.region" : "us-east-1", - "prometheus.auth.access_key" : "{{accessKey}}" - "prometheus.auth.secret_key" : "{{secretKey}}" - } - }] - -PPL Query support for prometheus connector -========================================== - -Metric as a Table ---------------------------- -Each connector has to abstract the underlying datasource constructs into a table as part of the interface contract with the PPL query engine. -Prometheus connector abstracts each metric as a table and the columns of this table are ``@value``, ``@timestamp``, ``label1``, ``label2``---. -``@value`` represents metric measurement and ``@timestamp`` represents the timestamp at which the metric is collected. labels are tags associated with metric queried. -For eg: ``handler``, ``code``, ``instance``, ``code`` are the labels associated with ``prometheus_http_requests_total`` metric. With this abstraction, we can query prometheus -data using PPL syntax similar to opensearch indices. 
- -Sample Example:: - - > source = my_prometheus.prometheus_http_requests_total; - - +--------+-----------------------+--------------+------+------------+------------+ - | @value | @timestamp | handler | code | instance | job | - |--------+-----------------------+--------------+------+------------+------------| - | 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 2 | "2022-11-03 07:18:44" | "/-/ready" | 400 | 192.15.2.1 | prometheus | - | 9 | "2022-11-03 07:18:54" | "/-/promql" | 400 | 192.15.2.1 | prometheus | - | 11 | "2022-11-03 07:18:64" | "/-/metrics" | 500 | 192.15.2.1 | prometheus | - +--------+-----------------------+--------------+------+------------+------------+ - - - -Default time range and resolution ---------------------------------- -Since time range and resolution are required parameters for query apis and these parameters are determined in the following manner from the PPL commands. -* Time range is determined through filter clause on ``@timestamp``. If there is no such filter clause, time range will be set to 1h with endtime set to now(). -* In case of stats, resolution is determined by ``span(@timestamp,15s)`` expression. For normal select queries, resolution is auto determined from the time range set. - -Prometheus Connector Limitations --------------------------------- -* Only one aggregation is supported in stats command. -* Span Expression is compulsory in stats command. -* AVG, MAX, MIN, SUM, COUNT are the only aggregations supported in prometheus connector. -* Where clause only supports EQUALS(=) operation on metric dimensions and Comparative(> , < , >= , <=) Operations on @timestamp attribute. - -Example queries ---------------- - -1. Metric Selection Query:: - - > source = my_prometheus.prometheus_http_requests_total - +--------+-----------------------+--------------+------+------------+------------+ - | @value | @timestamp | handler | code | instance | job | - |--------+-----------------------+--------------+------+------------+------------| - | 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 2 | "2022-11-03 07:18:44" | "/-/ready" | 400 | 192.15.2.1 | prometheus | - | 9 | "2022-11-03 07:18:54" | "/-/promql" | 400 | 192.15.2.1 | prometheus | - | 11 | "2022-11-03 07:18:64" | "/-/metrics" | 500 | 192.15.2.1 | prometheus | - +--------+-----------------------+--------------+------+------------+------------+ - -2. 
Metric Selecting Query with specific dimensions:: - - > source = my_prometheus.prometheus_http_requests_total | where handler='/-/ready' and code='200' - +--------+-----------------------+------------+------+------------+------------+ - | @value | @timestamp | handler | code | instance | job | - |--------+-----------------------+------------+------+------------+------------| - | 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 2 | "2022-11-03 07:18:44" | "/-/ready" | 200 | 192.15.2.1 | prometheus | - | 9 | "2022-11-03 07:18:54" | "/-/ready" | 200 | 192.15.2.1 | prometheus | - | 11 | "2022-11-03 07:18:64" | "/-/ready" | 200 | 192.15.2.1 | prometheus | - +--------+-----------------------+------------+------+------------+------------+ - -3. Average aggregation on a metric:: - - > source = my_prometheus.prometheus_http_requests_total | stats avg(@value) by span(@timestamp,15s) - +------------+------------------------+ - | avg(@value)| span(@timestamp,15s) | - |------------+------------------------+ - | 5 | "2022-11-03 07:18:14" | - | 3 | "2022-11-03 07:18:24" | - | 7 | "2022-11-03 07:18:34" | - | 2 | "2022-11-03 07:18:44" | - | 9 | "2022-11-03 07:18:54" | - | 11 | "2022-11-03 07:18:64" | - +------------+------------------------+ - -4. Average aggregation grouped by dimensions:: - - > source = my_prometheus.prometheus_http_requests_total | stats avg(@value) by span(@timestamp,15s), handler, code - +------------+------------------------+--------------------------------+---------------+ - | avg(@value)| span(@timestamp,15s) | handler | code | - |------------+------------------------+--------------------------------+---------------+ - | 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | - | 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | - | 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | - | 2 | "2022-11-03 07:18:44" | "/-/ready" | 400 | - | 9 | "2022-11-03 07:18:54" | "/-/promql" | 400 | - | 11 | "2022-11-03 07:18:64" | "/-/metrics" | 500 | - +------------+------------------------+--------------------------------+---------------+ - -5. Count aggregation query:: - - > source = my_prometheus.prometheus_http_requests_total | stats count() by span(@timestamp,15s), handler, code - +------------+------------------------+--------------------------------+---------------+ - | count() | span(@timestamp,15s) | handler | code | - |------------+------------------------+--------------------------------+---------------+ - | 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | - | 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | - | 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | - | 2 | "2022-11-03 07:18:44" | "/-/ready" | 400 | - | 9 | "2022-11-03 07:18:54" | "/-/promql" | 400 | - | 11 | "2022-11-03 07:18:64" | "/-/metrics" | 500 | - +------------+------------------------+--------------------------------+---------------+ - -PromQL Support for prometheus Connector -========================================== - -`query_range` Table Function ----------------------------- -* Prometheus connector offers `query_range` table function. This table function can be used to query metrics in a specific time range using promQL. 
-* The function takes inputs similar to parameters mentioned for query range api mentioned here: https://prometheus.io/docs/prometheus/latest/querying/api/ -* Arguments should be either passed by name or positionArguments should be either passed by name or position. - - `source=my_prometheus.query_range('prometheus_http_requests_total', 1686694425, 1686700130, 14)` - - `source=my_prometheus.query_range(query='prometheus_http_requests_total', starttime=1686694425, endtime=1686700130, step=14)` -Example:: - - > source=my_prometheus.query_range('prometheus_http_requests_total', 1686694425, 1686700130, 14) - +--------+-----------------------+--------------+------+------------+------------+ - | @value | @timestamp | handler | code | instance | job | - |--------+-----------------------+--------------+------+------------+------------| - | 5 | "2022-11-03 07:18:14" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 3 | "2022-11-03 07:18:24" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 7 | "2022-11-03 07:18:34" | "/-/ready" | 200 | 192.15.1.1 | prometheus | - | 2 | "2022-11-03 07:18:44" | "/-/ready" | 400 | 192.15.2.1 | prometheus | - | 9 | "2022-11-03 07:18:54" | "/-/promql" | 400 | 192.15.2.1 | prometheus | - | 11 | "2022-11-03 07:18:64" | "/-/metrics" | 500 | 192.15.2.1 | prometheus | - +--------+-----------------------+--------------+------+------------+------------+ - - -Prometheus Connector Table Functions -========================================== - -`query_exemplars` Table Function ----------------------------- -* This table function can be used to fetch exemplars of a query in a specific time range. -* The function takes inputs similar to parameters mentioned for query exemplars api mentioned here: https://prometheus.io/docs/prometheus/latest/querying/api/ -* Arguments should be either passed by name or positionArguments should be either passed by name or position. - - `source=my_prometheus.query_exemplars('prometheus_http_requests_total', 1686694425, 1686700130)` - - `source=my_prometheus.query_exemplars(query='prometheus_http_requests_total', starttime=1686694425, endtime=1686700130)` -Example:: - - > source=my_prometheus.query_exemplars('prometheus_http_requests_total', 1686694425, 1686700130) - "schema": [ - { - "name": "seriesLabels", - "type": "struct" - }, - { - "name": "exemplars", - "type": "array" - } - ], - "datarows": [ - [ - { - "instance": "localhost:8090", - "__name__": "test_exemplar_metric_total", - "service": "bar", - "job": "prometheus" - }, - [ - { - "labels": { - "traceID": "EpTxMJ40fUus7aGY" - }, - "timestamp": "2020-09-14 15:22:25.479", - "value": 6.0 - } - ] - ], - [ - { - "instance": "localhost:8090", - "__name__": "test_exemplar_metric_total", - "service": "foo", - "job": "prometheus" - }, - [ - { - "labels": { - "traceID": "Olp9XHlq763ccsfa" - }, - "timestamp": "2020-09-14 15:22:35.479", - "value": 19.0 - }, - { - "labels": { - "traceID": "hCtjygkIHwAN9vs4" - }, - "timestamp": "2020-09-14 15:22:45.489", - "value": 20.0 - } - ] - ] - ] diff --git a/docs/user/ppl/admin/connectors/s3glue_connector.md b/docs/user/ppl/admin/connectors/s3glue_connector.md new file mode 100644 index 00000000000..e05edbaa308 --- /dev/null +++ b/docs/user/ppl/admin/connectors/s3glue_connector.md @@ -0,0 +1,77 @@ +# S3Glue Connector + +## Introduction + +s3Glue connector provides a way to query s3 files using glue as metadata store and spark as execution engine. +This page covers s3Glue datasource configuration and also how to query and s3Glue datasource. 
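As a quick preview of how querying works, here is a hedged sketch of submitting a query through the async query interface (the endpoint and payload shape follow the Async Query APIs documentation linked at the end of this page; the host and the `my_glue` datasource name are illustrative placeholders):

```bash
# Submit a SQL query against a configured s3Glue datasource (sketch; adjust host/auth for your cluster)
curl -XPOST "http://localhost:9200/_plugins/_async_query" \
  -H 'Content-Type: application/json' \
  -d '{
    "datasource": "my_glue",
    "lang": "sql",
    "query": "select * from my_glue.default.http_logs limit 1"
  }'
```

The rest of this page walks through the resources and datasource configuration that make such a query possible.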
+## Required resources for s3 Glue Connector + +* `EMRServerless Spark Execution Engine Config Setting`: Since we execute s3Glue queries on top of the spark execution engine, we require this configuration. + + More details: [ExecutionEngine Config](../../../interfaces/asyncqueryinterface.md#id2) +* `S3`: This is where the data lies. +* `Glue` Metadata store: Glue takes care of table metadata. +* `Opensearch IndexStore`: Index for s3 data lies in opensearch and also acts as a temporary buffer for query results. + +We currently only support emr-serverless as the spark execution engine and Glue as the metadata store. We will add more support in the future. +Glue Connector Properties. +* `resultIndex` is a new parameter specific to the glue connector. It stores the results of queries executed on the data source. If unavailable, it defaults to .query_execution_result. +* `glue.auth.type` [Required] + * This parameter provides the authentication type information required for the execution engine to connect to glue. + * S3 Glue connector currently only supports `iam_role` authentication, and the below parameter is required. + * `glue.auth.role_arn` +* `glue.indexstore.opensearch.*` [Required] + * This parameter provides the Opensearch domain host information for the glue connector. This opensearch instance is used for writing index data back and also acts as a temporary buffer for query results. + * `glue.indexstore.opensearch.uri` [Required] + * `glue.indexstore.opensearch.auth` [Required] + * Accepted values include ["noauth", "basicauth", "awssigv4"] + * Basic auth requires `glue.indexstore.opensearch.auth.username` and `glue.indexstore.opensearch.auth.password` + * AWSSigV4 auth requires `glue.indexstore.opensearch.auth.region` and `glue.auth.role_arn` + * `glue.indexstore.opensearch.region` [Required for awssigv4 auth] +* `glue.iceberg.enabled` determines whether to enable Iceberg for the session. Default value is `"false"` if not specified. +* `glue.lakeformation.enabled` determines whether to enable Lake Formation for queries when Iceberg is also enabled. If Iceberg is not enabled, then this property has no effect. Default value is `"false"` if not specified. +* `glue.lakeformation.session_tag` specifies the session tag to use when assuming the data source role. This property is required when both Iceberg and Lake Formation are enabled (see the configuration sketch below). 
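For reference, a hedged sketch of a configuration that enables Iceberg and Lake Formation, using only the properties documented above (the role ARN, credentials, and session tag values are placeholders, not working values):

```bash
[{
    "name" : "my_glue",
    "connector": "s3glue",
    "properties" : {
        "glue.auth.type": "iam_role",
        "glue.auth.role_arn": "role_arn",
        "glue.indexstore.opensearch.uri": "http://localhost:9200",
        "glue.indexstore.opensearch.auth" :"basicauth",
        "glue.indexstore.opensearch.auth.username" :"username",
        "glue.indexstore.opensearch.auth.password" :"password",
        "glue.iceberg.enabled": "true",
        "glue.lakeformation.enabled": "true",
        "glue.lakeformation.session_tag": "session_tag"
    },
    "resultIndex": "query_execution_result"
}]
```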
+ +## Sample Glue dataSource configuration + +Glue datasource configuration + +```bash
[{
    "name" : "my_glue",
    "connector": "s3glue",
    "properties" : {
        "glue.auth.type": "iam_role",
        "glue.auth.role_arn": "role_arn",
        "glue.indexstore.opensearch.uri": "http://localhost:9200",
        "glue.indexstore.opensearch.auth" :"basicauth",
        "glue.indexstore.opensearch.auth.username" :"username",
        "glue.indexstore.opensearch.auth.password" :"password"
    },
    "resultIndex": "query_execution_result"
}]

[{
    "name" : "my_glue",
    "connector": "s3glue",
    "properties" : {
        "glue.auth.type": "iam_role",
        "glue.auth.role_arn": "role_arn",
        "glue.indexstore.opensearch.uri": "http://adsasdf.amazonopensearch.com:9200",
        "glue.indexstore.opensearch.auth" :"awssigv4",
        "glue.indexstore.opensearch.auth.region" :"us-east-1"
    },
    "resultIndex": "query_execution_result"
}]

```

## Sample s3Glue datasource queries APIs

Sample Queries
* Select Query : `select * from mys3.default.http_logs limit 1`
* Create Covering Index Query: `create index clientip_year on my_glue.default.http_logs (clientip, year) WITH (auto_refresh=true)`
* Create Skipping Index: `create skipping index on mys3.default.http_logs (status VALUE_SET)`

These queries work only on top of async queries. Documentation: [Async Query APIs](../../../interfaces/asyncqueryinterface.md)
Documentation for Index Queries: https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md \ No newline at end of file diff --git a/docs/user/ppl/admin/connectors/s3glue_connector.rst b/docs/user/ppl/admin/connectors/s3glue_connector.rst deleted file mode 100644 index 48f19a9d1e5..00000000000 --- a/docs/user/ppl/admin/connectors/s3glue_connector.rst +++ /dev/null @@ -1,92 +0,0 @@ -.. highlight:: sh - -==================== -S3Glue Connector -==================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - - -Introduction -============ - -s3Glue connector provides a way to query s3 files using glue as metadata store and spark as execution engine. -This page covers s3Glue datasource configuration and also how to query and s3Glue datasource. - -Required resources for s3 Glue Connector -======================================== -* ``EMRServerless Spark Execution Engine Config Setting``: Since we execute s3Glue queries on top of spark execution engine, we require this configuration. - More details: `ExecutionEngine Config <../../../interfaces/asyncqueryinterface.rst#id2>`_ -* ``S3``: This is where the data lies. -* ``Glue`` Metadata store: Glue takes care of table metadata. -* ``Opensearch IndexStore``: Index for s3 data lies in opensearch and also acts as temporary buffer for query results. - -We currently only support emr-serverless as spark execution engine and Glue as metadata store. we will add more support in future. - -Glue Connector Properties. - -* ``resultIndex`` is a new parameter specific to glue connector. Stores the results of queries executed on the data source. If unavailable, it defaults to .query_execution_result. -* ``glue.auth.type`` [Required] - * This parameters provides the authentication type information required for execution engine to connect to glue. - * S3 Glue connector currently only supports ``iam_role`` authentication and the below parameters is required. - * ``glue.auth.role_arn`` -* ``glue.indexstore.opensearch.*`` [Required] - * This parameters provides the Opensearch domain host information for glue connector.
This opensearch instance is used for writing index data back and also - * ``glue.indexstore.opensearch.uri`` [Required] - * ``glue.indexstore.opensearch.auth`` [Required] - * Accepted values include ["noauth", "basicauth", "awssigv4"] - * Basic Auth required ``glue.indexstore.opensearch.auth.username`` and ``glue.indexstore.opensearch.auth.password`` - * AWSSigV4 Auth requires ``glue.indexstore.opensearch.auth.region`` and ``glue.auth.role_arn`` - * ``glue.indexstore.opensearch.region`` [Required for awssigv4 auth] -* ``glue.iceberg.enabled`` determines whether to enable Iceberg for the session. Default value is ``"false"`` if not specified. -* ``glue.lakeformation.enabled`` determines whether to enable Lake Formation for queries when Iceberg is also enabled. If Iceberg is not enabled, then this property has no effect. Default value is ``"false"`` if not specified. -* ``glue.lakeformation.session_tag`` what session tag to use when assuming the data source role. This property is required when both Iceberg and Lake Formation are enabled. - -Sample Glue dataSource configuration -======================================== - -Glue datasource configuration:: - - [{ - "name" : "my_glue", - "connector": "s3glue", - "properties" : { - "glue.auth.type": "iam_role", - "glue.auth.role_arn": "role_arn", - "glue.indexstore.opensearch.uri": "http://localhost:9200", - "glue.indexstore.opensearch.auth" :"basicauth", - "glue.indexstore.opensearch.auth.username" :"username", - "glue.indexstore.opensearch.auth.password" :"password" - }, - "resultIndex": "query_execution_result" - }] - - [{ - "name" : "my_glue", - "connector": "s3glue", - "properties" : { - "glue.auth.type": "iam_role", - "glue.auth.role_arn": "role_arn", - "glue.indexstore.opensearch.uri": "http://adsasdf.amazonopensearch.com:9200", - "glue.indexstore.opensearch.auth" :"awssigv4", - "glue.indexstore.opensearch.auth.region" :"us-east-1" - }, - "resultIndex": "query_execution_result" - }] - -Sample s3Glue datasource queries APIS -===================================== - -Sample Queries - -* Select Query : ``select * from mys3.default.http_logs limit 1"`` -* Create Covering Index Query: ``create index clientip_year on my_glue.default.http_logs (clientip, year) WITH (auto_refresh=true)`` -* Create Skipping Index: ``create skipping index on mys3.default.http_logs (status VALUE_SET)`` - -These queries would work only top of async queries. Documentation: `Async Query APIs <../../../interfaces/asyncqueryinterface.rst>`_ - -Documentation for Index Queries: https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md diff --git a/docs/user/ppl/admin/connectors/security_lake_connector.md b/docs/user/ppl/admin/connectors/security_lake_connector.md new file mode 100644 index 00000000000..a9b27cf7e21 --- /dev/null +++ b/docs/user/ppl/admin/connectors/security_lake_connector.md @@ -0,0 +1,63 @@ +# Security Lake Connector + +## Introduction + +Security Lake connector provides a way to query Security Lake tables. +## Required resources for Security Lake Connector + +* `EMRServerless Spark Execution Engine Config Setting`: Since we execute s3Glue queries on top of spark execution engine, we require this configuration. + + More details: [ExecutionEngine Config](../../../interfaces/asyncqueryinterface.md#id2) +* `S3`: This is where the data lies. +* `Glue`: Metadata store: Glue takes care of table metadata. 
+* `Lake Formation`: AWS service that performs authorization on Security Lake tables +* `Security Lake`: AWS service that orchestrates creation of S3 files, Glue tables, and Lake Formation permissions. +* `Opensearch IndexStore`: Index for s3 data lies in opensearch and also acts as a temporary buffer for query results. + +We currently only support emr-serverless as the spark execution engine and Glue as the metadata store. We will add more support in the future. +Glue Connector Properties. +* `resultIndex` is a new parameter specific to the glue connector. It stores the results of queries executed on the data source. If unavailable, it defaults to .query_execution_result. +* `glue.auth.type` [Required] + * This parameter provides the authentication type information required for the execution engine to connect to glue. + * S3 Glue connector currently only supports `iam_role` authentication, and the below parameter is required. + * `glue.auth.role_arn` +* `glue.indexstore.opensearch.*` [Required] + * This parameter provides the Opensearch domain host information for the glue connector. This opensearch instance is used for writing index data back and also acts as a temporary buffer for query results. + * `glue.indexstore.opensearch.uri` [Required] + * `glue.indexstore.opensearch.auth` [Required] + * Accepted values include ["noauth", "basicauth", "awssigv4"] + * Basic auth requires `glue.indexstore.opensearch.auth.username` and `glue.indexstore.opensearch.auth.password` + * AWSSigV4 auth requires `glue.indexstore.opensearch.auth.region` and `glue.auth.role_arn` + * `glue.indexstore.opensearch.region` [Required for awssigv4 auth] +* `glue.lakeformation.session_tag` [Required] + * The session tag to use when assuming the data source role. 

## Sample Glue dataSource configuration 

Glue datasource configuration 

```bash
[{
    "name" : "my_sl",
    "connector": "security_lake",
    "properties" : {
        "glue.auth.type": "iam_role",
        "glue.auth.role_arn": "role_arn",
        "glue.indexstore.opensearch.uri": "http://adsasdf.amazonopensearch.com:9200",
        "glue.indexstore.opensearch.auth" :"awssigv4",
        "glue.indexstore.opensearch.auth.region" :"us-east-1",
        "glue.lakeformation.session_tag": "session_tag"
    },
    "resultIndex": "query_execution_result"
}]

```

## Sample Security Lake datasource queries APIs 

Sample Queries
* Select Query : `select * from mysl.amazon_security_lake_glue_db_eu_west_1.amazon_security_lake_table_eu_west_1_vpc_flow_2_0 limit 1`
* Create Covering Index Query: `create index srcip_time on mysl.amazon_security_lake_glue_db_eu_west_1.amazon_security_lake_table_eu_west_1_vpc_flow_2_0 (src_endpoint.ip, time) WITH (auto_refresh=true)`

These queries work only on top of async queries. Documentation: [Async Query APIs](../../../interfaces/asyncqueryinterface.md)
Documentation for Index Queries: https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md \ No newline at end of file diff --git a/docs/user/ppl/admin/connectors/security_lake_connector.rst b/docs/user/ppl/admin/connectors/security_lake_connector.rst deleted file mode 100644 index 6afddca1319..00000000000 --- a/docs/user/ppl/admin/connectors/security_lake_connector.rst +++ /dev/null @@ -1,78 +0,0 @@ -.. highlight:: sh - -==================== -Security Lake Connector -==================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - - -Introduction -============ - -Security Lake connector provides a way to query Security Lake tables.
- -Required resources for Security Lake Connector -======================================== -* ``EMRServerless Spark Execution Engine Config Setting``: Since we execute s3Glue queries on top of spark execution engine, we require this configuration. - More details: `ExecutionEngine Config <../../../interfaces/asyncqueryinterface.rst#id2>`_ -* ``S3``: This is where the data lies. -* ``Glue``: Metadata store: Glue takes care of table metadata. -* ``Lake Formation``: AWS service that performs authorization on Security Lake tables -* ``Security Lake``: AWS service that orchestrates creation of S3 files, Glue tables, and Lake Formation permissions. -* ``Opensearch IndexStore``: Index for s3 data lies in opensearch and also acts as temporary buffer for query results. - -We currently only support emr-serverless as spark execution engine and Glue as metadata store. we will add more support in future. - -Glue Connector Properties. - -* ``resultIndex`` is a new parameter specific to glue connector. Stores the results of queries executed on the data source. If unavailable, it defaults to .query_execution_result. -* ``glue.auth.type`` [Required] - * This parameters provides the authentication type information required for execution engine to connect to glue. - * S3 Glue connector currently only supports ``iam_role`` authentication and the below parameters is required. - * ``glue.auth.role_arn`` -* ``glue.indexstore.opensearch.*`` [Required] - * This parameters provides the Opensearch domain host information for glue connector. This opensearch instance is used for writing index data back and also - * ``glue.indexstore.opensearch.uri`` [Required] - * ``glue.indexstore.opensearch.auth`` [Required] - * Accepted values include ["noauth", "basicauth", "awssigv4"] - * Basic Auth required ``glue.indexstore.opensearch.auth.username`` and ``glue.indexstore.opensearch.auth.password`` - * AWSSigV4 Auth requires ``glue.indexstore.opensearch.auth.region`` and ``glue.auth.role_arn`` - * ``glue.indexstore.opensearch.region`` [Required for awssigv4 auth] -* ``glue.lakeformation.session_tag`` [Required] - * What session tag to use when assuming the data source role. - -Sample Glue dataSource configuration -======================================== - -Glue datasource configuration:: - - [{ - "name" : "my_sl", - "connector": "security_lake", - "properties" : { - "glue.auth.type": "iam_role", - "glue.auth.role_arn": "role_arn", - "glue.indexstore.opensearch.uri": "http://adsasdf.amazonopensearch.com:9200", - "glue.indexstore.opensearch.auth" :"awssigv4", - "glue.indexstore.opensearch.auth.region" :"us-east-1", - "glue.lakeformation.session_tag": "sesson_tag" - }, - "resultIndex": "query_execution_result" - }] - -Sample Security Lake datasource queries APIS -===================================== - -Sample Queries - -* Select Query : ``select * from mysl.amazon_security_lake_glue_db_eu_west_1.amazon_security_lake_table_eu_west_1_vpc_flow_2_0 limit 1`` -* Create Covering Index Query: ``create index srcip_time on mysl.amazon_security_lake_glue_db_eu_west_1.amazon_security_lake_table_eu_west_1_vpc_flow_2_0 (src_endpoint.ip, time) WITH (auto_refresh=true)`` - -These queries would work only top of async queries. 
Documentation: `Async Query APIs <../../../interfaces/asyncqueryinterface.rst>`_
-
-Documentation for Index Queries: https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md
diff --git a/docs/user/ppl/admin/cross_cluster_search.md b/docs/user/ppl/admin/cross_cluster_search.md
new file mode 100644
index 00000000000..4acdd41e354
--- /dev/null
+++ b/docs/user/ppl/admin/cross_cluster_search.md
@@ -0,0 +1,89 @@
+# Cross-Cluster Search
+
+## Introduction
+
+Cross-cluster search lets any node in a cluster execute search requests against other clusters.
+It makes searching easy across all connected clusters, allowing users to use multiple smaller clusters instead of a single large one.
+
+## Configuration
+
+On the local cluster, add the remote cluster name and the IP address with port 9300 for each seed node.
+
+```bash
+PUT _cluster/settings
+{
+  "persistent": {
+    "cluster.remote": {
+      "<remote-cluster-name>": {
+        "seeds": ["<remote-node-ip>:9300"]
+      }
+    }
+  }
+}
+```
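+
+To confirm that the remote cluster is reachable before querying it, you can check the remote cluster info API; a minimal sketch (host and port are placeholders):
+
+```bash ignore
+curl -sS 'localhost:9200/_remote/info'
+```
+
+The response lists each configured remote cluster with its seed nodes and whether it is currently connected.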
+
+## Using Cross-Cluster Search in PPL
+
+Perform cross-cluster search by using "<cluster>:<index>" as the index identifier.
+
+Example PPL query
+
+```ppl
+source=my_remote_cluster:accounts
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+
+| account_number | firstname | address              | balance | gender | city   | employer | state | age | email                 | lastname |
+|----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------|
+| 1              | Amber     | 880 Holmes Lane      | 39225   | M      | Brogan | Pyrami   | IL    | 32  | amberduke@pyrami.com  | Duke     |
+| 6              | Hattie    | 671 Bristol Street   | 5686    | M      | Dante  | Netagy   | TN    | 36  | hattiebond@netagy.com | Bond     |
+| 13             | Nanette   | 789 Madison Street   | 32838   | F      | Nogal  | Quility  | VA    | 28  | null                  | Bates    |
+| 18             | Dale      | 467 Hutchinson Court | 4180    | M      | Orick  | null     | MD    | 33  | daleadams@boink.com   | Adams    |
++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+
+```
+
+## Limitations
+
+Since OpenSearch does not support cross-cluster index metadata retrieval, field mapping of a remote cluster index is not available to the local cluster.
+([[Feature] Cross-cluster field mappings query #6573](https://github.com/opensearch-project/OpenSearch/issues/6573))
+Therefore, the query engine requires that, for any remote cluster index the users need to search,
+the local cluster keep a field mapping system index with the same index name.
+This can be done by creating an index on the local cluster with the same name and schema as the remote cluster index.
+
+## Authentication and Permission
+
+1. The security plugin authenticates the user on the local cluster.
+2. The security plugin fetches the user’s backend roles on the local cluster.
+3. The call, including the authenticated user, is forwarded to the remote cluster.
+4. The user’s permissions are evaluated on the remote cluster.
+
+Check [Cross-cluster search access control](https://opensearch.org/docs/latest/security/access-control/cross-cluster-search/) for more details.
+
+Example: Create the ppl_role for test_user on the local cluster and the ccs_role for test_user on the remote cluster. Then test_user can use PPL to query the `ppl-security-demo` index on the remote cluster.
+
+1. On the local cluster, refer to [Security Settings](security.md) to create a role and user for the PPL plugin and index access permission.
+2. On the remote cluster, create a new role and grant permission to access the index. Create a user with the same name and credentials as on the local cluster, and map the user to this role.
+
+```bash
+PUT _plugins/_security/api/roles/ccs_role
+{
+  "index_permissions":[
+    {
+      "index_patterns":["ppl-security-demo"],
+      "allowed_actions":[
+        "indices:admin/shards/search_shards",
+        "indices:data/read/search"
+      ]
+    }
+  ]
+}
+```
+
+```bash
+PUT _plugins/_security/api/rolesmapping/ccs_role
+{
+  "backend_roles" : [],
+  "hosts" : [],
+  "users" : ["test_user"]
+}
+```
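+
+Once both roles are in place, test_user can query the remote index from the local cluster; a minimal sketch reusing the `my_remote_cluster` alias configured above:
+
+```ppl ignore
+source=my_remote_cluster:ppl-security-demo | head 1
+```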
+
\ No newline at end of file
diff --git a/docs/user/ppl/admin/cross_cluster_search.rst b/docs/user/ppl/admin/cross_cluster_search.rst
deleted file mode 100644
index a94a0dce67e..00000000000
--- a/docs/user/ppl/admin/cross_cluster_search.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-.. highlight:: sh
-
-====================
-Cross-Cluster Search
-====================
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-Introduction
-============
-Cross-cluster search lets any node in a cluster execute search requests against other clusters.
-It makes searching easy across all connected clusters, allowing users to use multiple smaller clusters instead of a single large one.
-
-
-Configuration
-=============
-On the local cluster, add the remote cluster name and the IP address with port 9300 for each seed node. ::
-
-    PUT _cluster/settings
-    {
-      "persistent": {
-        "cluster.remote": {
-          "": {
-            "seeds": [":9300"]
-          }
-        }
-      }
-    }
-
-
-Using Cross-Cluster Search in PPL
-=================================
-Perform cross-cluster search by using ":" as the index identifier.
-
-Example PPL query::
-
-    os> source=my_remote_cluster:accounts;
-    fetched rows / total rows = 4/4
-    +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+
-    | account_number | firstname | address              | balance | gender | city   | employer | state | age | email                 | lastname |
-    |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------|
-    | 1              | Amber     | 880 Holmes Lane      | 39225   | M      | Brogan | Pyrami   | IL    | 32  | amberduke@pyrami.com  | Duke     |
-    | 6              | Hattie    | 671 Bristol Street   | 5686    | M      | Dante  | Netagy   | TN    | 36  | hattiebond@netagy.com | Bond     |
-    | 13             | Nanette   | 789 Madison Street   | 32838   | F      | Nogal  | Quility  | VA    | 28  | null                  | Bates    |
-    | 18             | Dale      | 467 Hutchinson Court | 4180    | M      | Orick  | null     | MD    | 33  | daleadams@boink.com   | Adams    |
-    +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+
-
-
-Limitations
-===========
-Since OpenSearch does not support cross cluster index metadata retrieval, field mapping of a remote cluster index is not available to the local cluster.
-(`[Feature] Cross cluster field mappings query #6573 `_)
-Therefore, the query engine requires that for any remote cluster index that the users need to search,
-the local cluster keep a field mapping system index with the same index name.
-This can be done by creating an index on the local cluster with the same name and schema as the remote cluster index.
-
-
-Authentication and Permission
-=============================
-
-1. The security plugin authenticates the user on the local cluster.
-2. The security plugin fetches the user’s backend roles on the local cluster.
-3. The call, including the authenticated user, is forwarded to the remote cluster.
-4. The user’s permissions are evaluated on the remote cluster.
-
-Check `Cross-cluster search access control `_ for more details.
-
-Example: Create the ppl_role for test_user on local cluster and the ccs_role for test_user on remote cluster. Then test_user could use PPL to query ``ppl-security-demo`` index on remote cluster.
-
-1. On the local cluster, refer to `Security Settings `_ to create role and user for PPL plugin and index access permission.
-
-2. On the remote cluster, create a new role and grant permission to access index. Create a user with the same name and credentials as the local cluster, and map the user to this role::
-
-    PUT _plugins/_security/api/roles/ccs_role
-    {
-      "index_permissions":[
-        {
-          "index_patterns":["ppl-security-demo"],
-          "allowed_actions":[
-            "indices:admin/shards/search_shards",
-            "indices:data/read/search"
-          ]
-        }
-      ]
-    }
-
-    PUT _plugins/_security/api/rolesmapping/ccs_role
-    {
-      "backend_roles" : [],
-      "hosts" : [],
-      "users" : ["test_user"]
-    }
diff --git a/docs/user/ppl/admin/datasources.md b/docs/user/ppl/admin/datasources.md
new file mode 100644
index 00000000000..5d89b0eda94
--- /dev/null
+++ b/docs/user/ppl/admin/datasources.md
@@ -0,0 +1,304 @@
+# Datasource Settings
+
+## Introduction
+
+The concept of `datasource` is introduced to support federation of the SQL/PPL query engine across multiple data stores.
+This helps PPL users leverage data from multiple data stores and derive correlations and insights.
+A datasource definition provides the information needed to connect to a data store and gives it a name to refer to in PPL commands.
+
+Refer to the sections below for quick setup.
+
+* [Datasource configuration APIs](#datasource-configuration-apis)
+* [Master Key config for encrypting credential information](#master-key-config-for-encrypting-credential-information)
+
+## Definitions of datasource and connector
+
+* Connector is a component that adapts the query engine to a datastore. For example, the Prometheus connector adapts and helps execute queries that run on a Prometheus datastore. The connector name is all that is needed in the datasource definition JSON.
+* Datasource is a construct that defines how to connect to a data store and which connector the query engine should use.
+
+Example Prometheus Datasource Definition
+
+```bash
+{
+    "name" : "my_prometheus",
+    "connector": "prometheus",
+    "properties" : {
+        "prometheus.uri" : "http://localhost:8080",
+        "prometheus.auth.type" : "basicauth",
+        "prometheus.auth.username" : "admin",
+        "prometheus.auth.password" : "admin"
+    },
+    "allowedRoles" : ["prometheus_access"],
+    "status" : "ACTIVE|DISABLED"
+}
+```
+
+Datasource configuration restrictions:
+* `name`, `connector`, `properties` are required fields in the datasource configuration.
+* In the case of secure domains, `allowedRoles` can be used to specify the OpenSearch roles allowed to access the datasource via PPL/SQL.
+* If `allowedRoles` are not specified for a datasource, only users with `all_access` can access the datasource in the case of secure domains.
+* In the case of security-disabled domains, authorization is disabled.
+* All datasource names should be unique and match the following regex: `[@*A-Za-z]+?[*a-zA-Z_\-0-9]*`.
+* Allowed connectors:
+  * `prometheus` [More details: [Prometheus Connector](connectors/prometheus_connector.md)]
+* All the allowed config parameters in `properties` are defined in the individual connector pages mentioned above.
+* From version 2.13, we have introduced a new optional field `status` which can be used to enable and disable a datasource. When a datasource is disabled, it blocks new queries, resulting in 400 errors for any attempts made on it. By default, when a datasource is created, its status is ACTIVE.
+
+## Datasource configuration APIs
+
+Datasource configuration can be managed using the REST APIs below. All the examples are for security-enabled OpenSearch domains;
+authorization and other details can be omitted in the case of security-disabled domains.
+
+* Datasource Creation POST API ("_plugins/_query/_datasources")
+
+```bash
+POST https://localhost:9200/_plugins/_query/_datasources
+content-type: application/json
+Authorization: Basic {{username}} {{password}}
+
+{
+    "name" : "my_prometheus",
+    "connector": "prometheus",
+    "properties" : {
+        "prometheus.uri" : "http://localhost:8080",
+        "prometheus.auth.type" : "basicauth",
+        "prometheus.auth.username" : "admin",
+        "prometheus.auth.password" : "admin"
+    },
+    "allowedRoles" : ["prometheus_access"]
+}
+```
+
+* Datasource modification PUT API ("_plugins/_query/_datasources")
+
+```bash
+PUT https://localhost:9200/_plugins/_query/_datasources
+content-type: application/json
+Authorization: Basic {{username}} {{password}}
+
+{
+    "name" : "my_prometheus",
+    "connector": "prometheus",
+    "properties" : {
+        "prometheus.uri" : "http://localhost:8080",
+        "prometheus.auth.type" : "basicauth",
+        "prometheus.auth.username" : "admin",
+        "prometheus.auth.password" : "admin"
+    },
+    "allowedRoles" : ["prometheus_access"]
+}
+```
+
+* Datasource modification PATCH API ("_plugins/_query/_datasources")
+
+```bash
+PATCH https://localhost:9200/_plugins/_query/_datasources
+content-type: application/json
+Authorization: Basic {{username}} {{password}}
+
+{
+    "name" : "my_prometheus",
+    "allowedRoles" : ["all_access"]
+}
+```
+
+**Name is required and must exist. Connector cannot be modified and will be ignored.**
+
+* Datasource Read GET API ("_plugins/_query/_datasources/{{dataSourceName}}")
+
+```bash
+GET https://localhost:9200/_plugins/_query/_datasources/my_prometheus
+content-type: application/json
+Authorization: Basic {{username}} {{password}}
+```
+
+**Authentication information won't be vended out in the GET API's response.**
+
+* Datasource Deletion DELETE API ("_plugins/_query/_datasources/{{dataSourceName}}")
+
+```bash
+DELETE https://localhost:9200/_plugins/_query/_datasources/my_prometheus
+content-type: application/json
+Authorization: Basic {{username}} {{password}}
+```
+
+## Authorization of datasource configuration APIs
+
+Each of the datasource configuration management APIs is controlled by one of the following actions, respectively.
+
+* cluster:admin/opensearch/datasources/create [Create POST API]
+* cluster:admin/opensearch/datasources/read [Get GET API]
+* cluster:admin/opensearch/datasources/update [Update PUT API]
+* cluster:admin/opensearch/datasources/patch [Update PATCH API]
+* cluster:admin/opensearch/datasources/delete [Delete DELETE API]
+
+Only users mapped to roles with the above actions are authorized to execute the datasource management APIs.
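+
+For example, a role granting all of these actions can be created through the security plugin API; a minimal sketch (the role name is a placeholder):
+
+```bash ignore
+PUT _plugins/_security/api/roles/datasource_admin_role
+{
+  "cluster_permissions": [
+    "cluster:admin/opensearch/datasources/create",
+    "cluster:admin/opensearch/datasources/read",
+    "cluster:admin/opensearch/datasources/update",
+    "cluster:admin/opensearch/datasources/patch",
+    "cluster:admin/opensearch/datasources/delete"
+  ]
+}
+```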
+
+## Master Key config for encrypting credential information
+
+* When users provide credentials for a data source, the system encrypts and securely stores them in the metadata index. The system uses the "AES/GCM/NoPadding" symmetric encryption algorithm.
+* The master key is a required config, and users can set it up by configuring the `plugins.query.datasources.encryption.masterkey` setting in the opensearch.yml file.
+* The master key must be 16, 24, or 32 characters long.
+* Sample Bash script to generate a 24-character master key
+
+```bash
+#!/bin/bash
+# Generate a 24-character key
+master_key=$(openssl rand -hex 12)
+echo "Master Key: $master_key"
+```
+
+* Sample Python script to generate a 24-character master key
+
+```python
+import random
+import string

+# Generate a 24-character random master key
+master_key = ''.join(random.choices(string.ascii_letters + string.digits, k=24))
+
+# Print the master key
+print("Generated master key:", master_key)
+```
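+
+As a sketch, the generated key can then be appended to `opensearch.yml` (the path varies by installation):
+
+```bash ignore
+echo "plugins.query.datasources.encryption.masterkey: $master_key" >> /etc/opensearch/opensearch.yml
+```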
+
+## Datasource URI Hosts Deny Lists Config
+
+* In the OpenSearch configuration file (opensearch.yml), the parameter "plugins.query.datasources.uri.hosts.denylist" can be used to control the permitted host IPs within the datasource URI configuration.
+* By default, the value is set to an empty list, which allows any domain to be accepted.
+* For instance, if you set the value to `127.0.0.0/8`, the PPL plugin will deny all query requests where the datasource URI resolves to an IP in the range `127.0.0.0` to `127.255.255.255`.
+
+## Using a datasource in PPL command
+
+A datasource is referred to in the source command as shown in the code block below.
+Based on the abstraction designed by the connector,
+one can refer to the corresponding entity as a table in the source command.
+For example, in the Prometheus connector each metric is abstracted as a table,
+so we can refer to a metric and apply stats over it in the following way.
+
+Example source command with prometheus datasource
+
+```ppl ignore
+source = my_prometheus.prometheus_http_requests_total | stats avg(@value) by job;
+```
+
+## Authorization of PPL commands on datasources
+
+In the case of secure OpenSearch domains, only admins and users with the roles mentioned in the datasource configuration are allowed to run queries.
+For example: with the below datasource configuration, only admins and users with the prometheus_access role can run queries on the my_prometheus datasource.
+
+```bash
+{
+    "name" : "my_prometheus",
+    "connector": "prometheus",
+    "properties" : {
+        "prometheus.uri" : "http://localhost:8080",
+        "prometheus.auth.type" : "basicauth",
+        "prometheus.auth.username" : "admin",
+        "prometheus.auth.password" : "admin"
+    },
+    "allowedRoles" : ["prometheus_access"]
+}
+```
+
+## Moving from keystore datasource configuration
+
+* In versions prior to 2.7, the `plugins.query.federation.datasources.config` keystore setting was used to configure datasources, but it has been deprecated and will be removed in version 3.0.
+* To port previously configured datasources from the keystore, users can use the `create datasource` REST API mentioned in the above section.
+
+## Disabling a datasource to block new queries
+
+* You can disable a datasource using the PATCH or PUT API. Below is an example request for disabling a datasource named "my_prometheus" using the PATCH API.
+
+```bash
+PATCH https://localhost:9200/_plugins/_query/_datasources
+content-type: application/json
+Authorization: Basic {{username}} {{password}}
+
+{
+    "name" : "my_prometheus",
+    "status" : "disabled"
+}
+```
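+
+To re-enable the datasource later, send the same request with the status set back to active; a minimal sketch mirroring the example above:
+
+```bash ignore
+PATCH https://localhost:9200/_plugins/_query/_datasources
+content-type: application/json
+Authorization: Basic {{username}} {{password}}
+
+{
+    "name" : "my_prometheus",
+    "status" : "active"
+}
+```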
+
+## Metadata queries using information_schema
+
+Use `information_schema` in the source command to query table information under a datasource.
+In the current state, `information_schema` only supports metadata of tables.
+This schema will be extended for views, columns, and other metadata in the future.
+
+### Syntax
+
+source = datasource.information_schema.tables;
+
+### Example 1: Fetch tables in prometheus datasource
+
+This example fetches tables in the prometheus datasource.
+
+PPL query for fetching PROMETHEUS TABLES with a where clause
+
+```ppl
+source = my_prometheus.information_schema.tables
+| where TABLE_NAME='prometheus_http_requests_total'
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------+--------------+--------------------------------+------------+------+---------------------------+
+| TABLE_CATALOG | TABLE_SCHEMA | TABLE_NAME                     | TABLE_TYPE | UNIT | REMARKS                   |
+|---------------+--------------+--------------------------------+------------+------+---------------------------|
+| my_prometheus | default      | prometheus_http_requests_total | counter    |      | Counter of HTTP requests. |
++---------------+--------------+--------------------------------+------------+------+---------------------------+
+```
+
+### Example 2: Search tables in prometheus datasource
+
+This example searches tables in the prometheus datasource.
+
+PPL query for searching PROMETHEUS TABLES
+
+```ppl
+source = my_prometheus.information_schema.tables
+| where LIKE(TABLE_NAME, "%http%")
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 6/6
++---------------+--------------+---------------------------------------------+------------+------+-----------------------------------------------------+
+| TABLE_CATALOG | TABLE_SCHEMA | TABLE_NAME                                  | TABLE_TYPE | UNIT | REMARKS                                             |
+|---------------+--------------+---------------------------------------------+------------+------+-----------------------------------------------------|
+| my_prometheus | default      | prometheus_http_requests_total              | counter    |      | Counter of HTTP requests.                           |
+| my_prometheus | default      | promhttp_metric_handler_requests_in_flight | gauge      |      | Current number of scrapes being served.             |
+| my_prometheus | default      | prometheus_http_request_duration_seconds   | histogram  |      | Histogram of latencies for HTTP requests.           |
+| my_prometheus | default      | prometheus_sd_http_failures_total          | counter    |      | Number of HTTP service discovery refresh failures.  |
+| my_prometheus | default      | promhttp_metric_handler_requests_total     | counter    |      | Total number of scrapes by HTTP status code.        |
+| my_prometheus | default      | prometheus_http_response_size_bytes        | histogram  |      | Histogram of response size for HTTP requests.       |
++---------------+--------------+---------------------------------------------+------------+------+-----------------------------------------------------+
+```
+
+## Fetch metadata for table in Prometheus datasource
+
+After a Prometheus datasource is configured, you can inspect the schema of any metric by running the `describe` command against the fully qualified table name. For example:
+
+```ppl
+describe my_prometheus.prometheus_http_requests_total
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 6/6
++---------------+--------------+--------------------------------+-------------+-----------+
+| TABLE_CATALOG | TABLE_SCHEMA | TABLE_NAME                     | COLUMN_NAME | DATA_TYPE |
+|---------------+--------------+--------------------------------+-------------+-----------|
+| my_prometheus | default      | prometheus_http_requests_total | handler     | string    |
+| my_prometheus | default      | prometheus_http_requests_total | code        | string    |
+| my_prometheus | default      | prometheus_http_requests_total | instance    | string    |
+| my_prometheus | default      | prometheus_http_requests_total | @timestamp  | timestamp |
+| my_prometheus | default      | prometheus_http_requests_total | @value      | double    |
+| my_prometheus | default      | prometheus_http_requests_total | job         | string    |
++---------------+--------------+--------------------------------+-------------+-----------+
```
+
+## Limitations
+
+When using PPL, data sources other than OpenSearch only work with `plugins.calcite.enabled=false`.
+When Calcite is enabled, queries against non-OpenSearch data sources implicitly fall back to v2, which means new PPL commands/functions introduced in 3.0.0 and above cannot be used with non-OpenSearch data sources.
\ No newline at end of file
diff --git a/docs/user/ppl/admin/datasources.rst b/docs/user/ppl/admin/datasources.rst
deleted file mode 100644
index c5f9adfd85a..00000000000
--- a/docs/user/ppl/admin/datasources.rst
+++ /dev/null
@@ -1,290 +0,0 @@
-.. highlight:: sh
-
-===================
-Datasource Settings
-===================
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-Introduction
-============
-
-The concept of ``datasource`` is introduced to support the federation of SQL/PPL query engine to multiple data stores.
-This helps PPL users to leverage data from multiple data stores and derive correlation and insights.
-Datasource definition provides the information to connect to a data store and also gives a name to them to refer in PPL commands.
-
-Refer below sections for quick setup.
-
-* `Datasource configuration APIs`_
-* `Master Key config for encrypting credential information`_
-
-
-Definitions of datasource and connector
-=======================================
-* Connector is a component that adapts the query engine to a datastore. For example, Prometheus connector would adapt and help execute the queries to run on Prometheus datastore. connector name is enough in the datasource definition json.
-* Datasource is a construct to define how to connect to a data store and which connector to adapt by query engine.
-
-Example Prometheus Datasource Definition ::
-
-    {
-      "name" : "my_prometheus",
-      "connector": "prometheus",
-      "properties" : {
-          "prometheus.uri" : "http://localhost:8080",
-          "prometheus.auth.type" : "basicauth",
-          "prometheus.auth.username" : "admin",
-          "prometheus.auth.password" : "admin"
-      },
-      "allowedRoles" : ["prometheus_access"],
-      "status" : "ACTIVE|DISABLED"
-    }
-Datasource configuration Restrictions.
-
-* ``name``, ``connector``, ``properties`` are required fields in the datasource configuration.
-* In case of secure domains, ``allowedRoles`` can be used to specify the opensearch roles allowed to access the datasource via PPL/SQL.
-* If ``allowedRoles`` are not specified for a datasource, only users with ``all_access`` could access the datasource in case of secure domains.
-* In case of security disabled domains, authorization is disbaled. -* All the datasource names should be unique and match the following regex[``[@*A-Za-z]+?[*a-zA-Z_\-0-9]*``]. -* Allowed Connectors. - * ``prometheus`` [More details: `Prometheus Connector `_] -* All the allowed config parameters in ``properties`` are defined in individual connector pages mentioned above. -* From version 2.13, we have introduced a new optional field ``status`` which can be used to enable and disable a datasource.When a datasource is disabled, it blocks new queries, resulting in 400 errors for any attempts made on it. By default when a datasource is created, status is ACTIVE. - - -Datasource configuration APIs -============================= -Datasource configuration can be managed using below REST APIs. All the examples below are for OpenSearch domains enabled with secure domain. -we can remove authorization and other details in case of security disabled domains. - -* Datasource Creation POST API ("_plugins/_query/_datasources") :: - - POST https://localhost:9200/_plugins/_query/_datasources - content-type: application/json - Authorization: Basic {{username}} {{password}} - - { - "name" : "my_prometheus", - "connector": "prometheus", - "properties" : { - "prometheus.uri" : "http://localhost:8080", - "prometheus.auth.type" : "basicauth", - "prometheus.auth.username" : "admin", - "prometheus.auth.password" : "admin" - }, - "allowedRoles" : ["prometheus_access"] - } - -* Datasource modification PUT API ("_plugins/_query/_datasources") :: - - PUT https://localhost:9200/_plugins/_query/_datasources - content-type: application/json - Authorization: Basic {{username}} {{password}} - - { - "name" : "my_prometheus", - "connector": "prometheus", - "properties" : { - "prometheus.uri" : "http://localhost:8080", - "prometheus.auth.type" : "basicauth", - "prometheus.auth.username" : "admin", - "prometheus.auth.password" : "admin" - }, - "allowedRoles" : ["prometheus_access"] - } - -* Datasource modification PATCH API ("_plugins/_query/_datasources") :: - - PATCH https://localhost:9200/_plugins/_query/_datasources - content-type: application/json - Authorization: Basic {{username}} {{password}} - - { - "name" : "my_prometheus", - "allowedRoles" : ["all_access"] - } - - **Name is required and must exist. Connector cannot be modified and will be ignored.** - -* Datasource Read GET API("_plugins/_query/_datasources/{{dataSourceName}}" :: - - GET https://localhost:9200/_plugins/_query/_datasources/my_prometheus - content-type: application/json - Authorization: Basic {{username}} {{password}} - - **Authentication Information won't be vended out in GET API's response.** - -* Datasource Deletion DELETE API("_plugins/_query/_datasources/{{dataSourceName}}") :: - - DELETE https://localhost:9200/_plugins/_query/_datasources/my_prometheus - content-type: application/json - Authorization: Basic {{username}} {{password}} - -Authorization of datasource configuration APIs -============================================== -Each of the datasource configuration management apis are controlled by following actions respectively. 
- -* cluster:admin/opensearch/datasources/create [Create POST API] -* cluster:admin/opensearch/datasources/read [Get GET API] -* cluster:admin/opensearch/datasources/update [Update PUT API] -* cluster:admin/opensearch/datasources/patch [Update PATCH API] -* cluster:admin/opensearch/datasources/delete [Delete DELETE API] - -Only users mapped with roles having above actions are authorized to execute datasource management apis. - -Master Key config for encrypting credential information -======================================================== -* When users provide credentials for a data source, the system encrypts and securely stores them in the metadata index. System uses "AES/GCM/NoPadding" symmetric encryption algorithm. -* Master key is a required config and users can set this up by configuring the `plugins.query.datasources.encryption.masterkey` setting in the opensearch.yml file. -* The master key must be 16, 24, or 32 characters long. -* Sample Bash Script to generate a 24 character master key :: - - #!/bin/bash - # Generate a 24-character key - master_key=$(openssl rand -hex 12) - echo "Master Key: $master_key" -* Sample python script to generate a 24 character master key :: - - import random - import string - - # Generate a 24-character random master key - master_key = ''.join(random.choices(string.ascii_letters + string.digits, k=24)) - - # Print the master key - print("Generated master key:", master_key) - -Datasource URI Hosts Deny Lists Config -====================================== -* In the OpenSearch configuration file (opensearch.yml), the parameter "plugins.query.datasources.uri.hosts.denylist" can be utilized to control the permitted host ips within the datasource URI configuration. -* By default, the value is set to empty list, which allows any domain to be accepted. -* For instance, if you set the value to `127.0.0.0/8`, ppl plugins will deny all the query requests where the datasource URI resolves to the ip range from `127.0.0.0` to `127.255.255.255` - - -Using a datasource in PPL command -================================= -Datasource is referred in source command as show in the code block below. -Based on the abstraction designed by the connector, -one can refer the corresponding entity as table in the source command. -For example in prometheus connector, each metric is abstracted as a table. -so we can refer a metric and apply stats over it in the following way. - -Example source command with prometheus datasource :: - - >> source = my_prometheus.prometheus_http_requests_total | stats avg(@value) by job; - - -Authorization of PPL commands on datasources -============================================ -In case of secure opensearch domains, only admins and users with roles mentioned in datasource configuration are allowed to make queries. -For example: with below datasource configuration, only admins and users with prometheus_access role can run queries on my_prometheus datasource. :: - - { - "name" : "my_prometheus", - "connector": "prometheus", - "properties" : { - "prometheus.uri" : "http://localhost:8080", - "prometheus.auth.type" : "basicauth", - "prometheus.auth.username" : "admin", - "prometheus.auth.password" : "admin" - }, - "allowedRoles" : ["prometheus_access"] - } - - -Moving from keystore datasource configuration -============================================= -* In versions prior to 2.7, the plugins.query.federation.datasources.config key store setting was used to configure datasources, but it has been deprecated and will be removed in version 3.0. 
-* To port previously configured datasources from the keystore, users can use the `create datasource` REST API mentioned in the above section. - -Disabling a datasource to block new queries -=========================================== -* We can disable a datasource using PATCH or PUT API. Below is the example request for disabling a datasource named "my_prometheus" using PATCH API. :: - - PATCH https://localhost:9200/_plugins/_query/_datasources - content-type: application/json - Authorization: Basic {{username}} {{password}} - - { - "name" : "my_prometheus", - "status" : "disabled" - } - - -Metadata queries using information_schema -========================================= -Use ``information_schema`` in source command to query tables information under a datasource. -In the current state, ``information_schema`` only support metadata of tables. -This schema will be extended for views, columns and other metadata info in future. - -Syntax ------- -source = datasource.information_schema.tables; - -Example 1: Fetch tables in prometheus datasource ------------------------------------------------- - -The examples fetches tables in the prometheus datasource. - -PPL query for fetching PROMETHEUS TABLES with where clause:: - - PPL> source = my_prometheus.information_schema.tables | where TABLE_NAME='prometheus_http_requests_total' - fetched rows / total rows = 1/1 - +---------------+--------------+--------------------------------+------------+------+---------------------------+ - | TABLE_CATALOG | TABLE_SCHEMA | TABLE_NAME | TABLE_TYPE | UNIT | REMARKS | - |---------------+--------------+--------------------------------+------------+------+---------------------------| - | my_prometheus | default | prometheus_http_requests_total | counter | | Counter of HTTP requests. | - +---------------+--------------+--------------------------------+------------+------+---------------------------+ - - -Example 2: Search tables in prometheus datasource -------------------------------------------------- - -The examples searches tables in the prometheus datasource. - -PPL query for searching PROMETHEUS TABLES:: - - PPL> source = my_prometheus.information_schema.tables | where LIKE(TABLE_NAME, "%http%"); - fetched rows / total rows = 6/6 - +---------------+--------------+--------------------------------------------+------------+------+----------------------------------------------------+ - | TABLE_CATALOG | TABLE_SCHEMA | TABLE_NAME | TABLE_TYPE | UNIT | REMARKS | - |---------------+--------------+--------------------------------------------+------------+------+----------------------------------------------------| - | my_prometheus | default | prometheus_http_requests_total | counter | | Counter of HTTP requests. | - | my_prometheus | default | promhttp_metric_handler_requests_in_flight | gauge | | Current number of scrapes being served. | - | my_prometheus | default | prometheus_http_request_duration_seconds | histogram | | Histogram of latencies for HTTP requests. | - | my_prometheus | default | prometheus_sd_http_failures_total | counter | | Number of HTTP service discovery refresh failures. | - | my_prometheus | default | promhttp_metric_handler_requests_total | counter | | Total number of scrapes by HTTP status code. | - | my_prometheus | default | prometheus_http_response_size_bytes | histogram | | Histogram of response size for HTTP requests. | - +---------------+--------------+--------------------------------------------+------------+------+----------------------------------------------------+ - - -.. 
_datasources-prometheus-metadata:
-
-Fetch metadata for table in Prometheus datasource
-=================================================
-
-After a Prometheus datasource is configured, you can inspect the schema of any metric by running the ``describe`` command against the fully qualified table name. For example::
-
-PPL query::
-
-    PPL> describe my_prometheus.prometheus_http_requests_total;
-    fetched rows / total rows = 6/6
-    +---------------+--------------+--------------------------------+-------------+-----------+
-    | TABLE_CATALOG | TABLE_SCHEMA | TABLE_NAME                     | COLUMN_NAME | DATA_TYPE |
-    |---------------+--------------+--------------------------------+-------------+-----------|
-    | my_prometheus | default      | prometheus_http_requests_total | handler     | string    |
-    | my_prometheus | default      | prometheus_http_requests_total | code        | string    |
-    | my_prometheus | default      | prometheus_http_requests_total | instance    | string    |
-    | my_prometheus | default      | prometheus_http_requests_total | @timestamp  | timestamp |
-    | my_prometheus | default      | prometheus_http_requests_total | @value      | double    |
-    | my_prometheus | default      | prometheus_http_requests_total | job         | string    |
-    +---------------+--------------+--------------------------------+-------------+-----------+
-
-Limitations
-===========
-
-In using PPL, data sources except OpenSearch can only work with ``plugins.calcite.enabled=false``.
-When Calcite is enabled, queries against non-OpenSearch data sources will implicit fallback to v2, which means new PPL commands/functions introduced in 3.0.0 and above cannot work together with non-OpenSearch data sources.
diff --git a/docs/user/ppl/admin/monitoring.md b/docs/user/ppl/admin/monitoring.md
new file mode 100644
index 00000000000..7a85a0bac01
--- /dev/null
+++ b/docs/user/ppl/admin/monitoring.md
@@ -0,0 +1,35 @@
+# Monitoring
+
+## Introduction
+
+Through the stats endpoint, you can collect metrics for the plugin within the interval. Note that only node-level statistics collection is implemented for now. In other words, you only get the metrics for the node you're accessing. Cluster-level statistics have yet to be implemented.
+
+## Node Stats
+
+### Description
+
+The meaning of fields in the response is as follows:
+
+| Field name | Description |
+|------------|-------------|
+| `ppl_request_total` | Total count of PPL requests |
+| `ppl_request_count` | Total count of PPL requests within the interval |
+| `ppl_failed_request_count_syserr` | Count of failed PPL requests due to system error within the interval |
+| `ppl_failed_request_count_cuserr` | Count of failed PPL requests due to bad request within the interval |
+
+### Example
+
+```bash ignore
+curl -H 'Content-Type: application/json' -X GET localhost:9200/_plugins/_ppl/stats
+```
+
+```json
+{
+  "ppl_request_total": 10,
+  "ppl_request_count": 2,
+  "ppl_failed_request_count_syserr": 0,
+  "ppl_failed_request_count_cuserr": 0,
+  ...
+}
+```
+
\ No newline at end of file
diff --git a/docs/user/ppl/admin/monitoring.rst b/docs/user/ppl/admin/monitoring.rst
deleted file mode 100644
index 625b0411c49..00000000000
--- a/docs/user/ppl/admin/monitoring.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-.. highlight:: sh
-
-==========
-Monitoring
-==========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-
-Introduction
-============
-
-By a stats endpoint, you are able to collect metrics for the plugin within the interval. Note that only node level statistics collecting is implemented for now. In other words, you only get the metrics for the node you're accessing. Cluster level statistics have yet to be implemented.
-
-Node Stats
-==========
-
-Description
------------
-
-The meaning of fields in the response is as follows:
-
-+--------------------------------+-------------------------------------------------------------------+
-| Field name| Description|
-+================================+===================================================================+
-| ppl_request_total| Total count of PPL request|
-+--------------------------------+-------------------------------------------------------------------+
-| ppl_request_count| Total count of PPL request within the interval|
-+--------------------------------+-------------------------------------------------------------------+
-| ppl_failed_request_count_syserr|Count of failed PPL request due to system error within the interval|
-+--------------------------------+-------------------------------------------------------------------+
-| ppl_failed_request_count_cuserr| Count of failed PPL request due to bad request within the interval|
-+--------------------------------+-------------------------------------------------------------------+
-
-
-Example
--------
-
-SQL query::
-
-    >> curl -H 'Content-Type: application/json' -X GET localhost:9200/_plugins/_ppl/stats
-
-Result set::
-
-    {
-      "ppl_request_total": 10,
-      "ppl_request_count": 2,
-      "ppl_failed_request_count_syserr": 0,
-      "ppl_failed_request_count_cuserr": 0,
-      ...
-    }
-
diff --git a/docs/user/ppl/admin/security.md b/docs/user/ppl/admin/security.md
new file mode 100644
index 00000000000..d41f3b23e23
--- /dev/null
+++ b/docs/user/ppl/admin/security.md
@@ -0,0 +1,65 @@
+# Security Settings
+
+## Introduction
+
+Users need the `cluster:admin/opensearch/ppl` permission to use the PPL plugin. Users also need the index-level permissions `indices:admin/mappings/get` to get field mappings, `indices:monitor/settings/get` to get cluster settings, and `indices:data/read/search*` to search indices.
+
+## Using Rest API
+
+**--INTRODUCED 2.1--**
+
+Example: Create the ppl_role for test_user. Then test_user can use PPL to query the `ppl-security-demo` index.
+
+1. Create the ppl_role and grant permission to access the PPL plugin and the ppl-security-demo index
+
+```bash
+PUT _plugins/_security/api/roles/ppl_role
+{
+  "cluster_permissions": [
+    "cluster:admin/opensearch/ppl"
+  ],
+  "index_permissions": [{
+    "index_patterns": [
+      "ppl-security-demo"
+    ],
+    "allowed_actions": [
+      "indices:data/read/search*",
+      "indices:admin/mappings/get",
+      "indices:monitor/settings/get"
+    ]
+  }]
+}
+```
+
+2. Map the test_user to the ppl_role
+
+```bash
+PUT _plugins/_security/api/rolesmapping/ppl_role
+{
+  "backend_roles" : [],
+  "hosts" : [],
+  "users" : ["test_user"]
+}
+```
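+
+With the role and mapping in place, test_user can verify access by running a PPL query; a minimal sketch (the password is a placeholder):
+
+```bash ignore
+curl -sS -u 'test_user:{{password}}' -H 'Content-Type: application/json' \
+-X POST 'localhost:9200/_plugins/_ppl' \
+-d '{"query": "source=ppl-security-demo | head 1"}'
+```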
+
+## Using Security Dashboard
+
+**--INTRODUCED 2.1--**
+
+Example: Create the ppl_access permission and add it to an existing role
+
+1. Create the ppl_access permission
+
+```bash
+PUT _plugins/_security/api/actiongroups/ppl_access
+{
+  "allowed_actions": [
+    "cluster:admin/opensearch/ppl"
+  ]
+}
+```
+
+2. Grant the ppl_access permission to ppl_test_role
+
+![Image](https://user-images.githubusercontent.com/2969395/185448976-6c0aed6b-7540-4b99-92c3-362da8ae3763.png)
diff --git a/docs/user/ppl/admin/security.rst b/docs/user/ppl/admin/security.rst
deleted file mode 100644
index e512cc259c2..00000000000
--- a/docs/user/ppl/admin/security.rst
+++ /dev/null
@@ -1,70 +0,0 @@
-.. highlight:: sh
-
-=================
-Security Settings
-=================
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-Introduction
-============
-
-User needs ``cluster:admin/opensearch/ppl`` permission to use PPL plugin. User also needs indices level permission ``indices:admin/mappings/get`` to get field mappings, ``indices:monitor/settings/get`` to get cluster settings, and ``indices:data/read/search*`` to search index.
-
-Using Rest API
-==============
-**--INTRODUCED 2.1--**
-
-Example: Create the ppl_role for test_user. then test_user could use PPL to query ``ppl-security-demo`` index.
-
-1. Create the ppl_role and grand permission to access PPL plugin and access ppl-security-demo index::
-
-    PUT _plugins/_security/api/roles/ppl_role
-    {
-      "cluster_permissions": [
-        "cluster:admin/opensearch/ppl"
-      ],
-      "index_permissions": [{
-        "index_patterns": [
-          "ppl-security-demo"
-        ],
-        "allowed_actions": [
-          "indices:data/read/search*",
-          "indices:admin/mappings/get",
-          "indices:monitor/settings/get"
-        ]
-      }]
-    }
-
-2. Mapping the test_user to the ppl_role::
-
-    PUT _plugins/_security/api/rolesmapping/ppl_role
-    {
-      "backend_roles" : [],
-      "hosts" : [],
-      "users" : ["test_user"]
-    }
-
-
-Using Security Dashboard
-========================
-**--INTRODUCED 2.1--**
-
-Example: Create ppl_access permission and add to existing role
-
-1. Create the ppl_access permission::
-
-    PUT _plugins/_security/api/actiongroups/ppl_access
-    {
-      "allowed_actions": [
-        "cluster:admin/opensearch/ppl"
-      ]
-    }
-
-2. Grant the ppl_access permission to ppl_test_role
-
-.. image:: https://user-images.githubusercontent.com/2969395/185448976-6c0aed6b-7540-4b99-92c3-362da8ae3763.png
diff --git a/docs/user/ppl/admin/settings.md b/docs/user/ppl/admin/settings.md
new file mode 100644
index 00000000000..e521ab6c11f
--- /dev/null
+++ b/docs/user/ppl/admin/settings.md
@@ -0,0 +1,441 @@
+# PPL Settings
+
+## Introduction
+
+When OpenSearch bootstraps, the PPL plugin registers a few settings in the OpenSearch cluster settings. Most of these settings can be changed dynamically, so you can control the behavior of the PPL plugin without bouncing your cluster.
+
+## plugins.ppl.enabled
+
+### Description
+
+You can disable the PPL plugin to reject all incoming requests.
+
+1. The default value is true.
+2. This setting is node scope.
+3. This setting can be updated dynamically.
+
+Note: calls to `_plugins/_ppl` include index names in the request body, so they have the same access policy considerations as the bulk, mget, and msearch operations. If `rest.action.multi.allow_explicit_index` is set to false, the PPL plugin will be disabled.
+
+### Example 1
+
+You can update the setting with a new value like this.
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"transient" : {"plugins.ppl.enabled" : "false"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {
+    "plugins": {
+      "ppl": {
+        "enabled": "false"
+      }
+    }
+  }
+}
+```
+
+Note: the legacy setting `opendistro.ppl.enabled` is deprecated; it will fall back to the new setting if you request an update with the legacy name.
+
+### Example 2
+
+The query result after the setting is updated looks like:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl \
+-d '{"query": "source=my_prometheus"}'
+```
+
+Expected output:
+
+```json
+{
+  "error": {
+    "reason": "Invalid Query",
+    "details": "Either plugins.ppl.enabled or rest.action.multi.allow_explicit_index setting is false",
+    "type": "IllegalAccessException"
+  },
+  "status": 400
+}
+```
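+
+To inspect the current values of these settings, you can read them back from the cluster settings API; a minimal sketch (only explicitly set values are returned unless defaults are requested with `include_defaults=true`):
+
+```bash ignore
+curl -sS 'localhost:9200/_cluster/settings?flat_settings=true'
+```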
+
+## plugins.ppl.query.timeout
+
+### Description
+
+This setting controls the maximum execution time for PPL queries. When a query exceeds this timeout, it will be interrupted and return a timeout error.
+
+1. The default value is 300s (5 minutes).
+2. This setting is node scope.
+3. This setting can be updated dynamically.
+
+### Example
+
+You can configure the query timeout:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"transient" : {"plugins.ppl.query.timeout" : "60s"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {
+    "plugins": {
+      "ppl": {
+        "query": {
+          "timeout": "60s"
+        }
+      }
+    }
+  }
+}
+```
+
+## plugins.query.memory_limit
+
+### Description
+
+You can set a heap memory usage limit for the query engine. While a query is running, the engine checks whether heap memory usage is under the limit; if not, it terminates the current query. The default value is 85%.
+
+### Example
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"persistent" : {"plugins.query.memory_limit" : "80%"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {
+    "plugins": {
+      "query": {
+        "memory_limit": "80%"
+      }
+    }
+  },
+  "transient": {}
+}
+```
+
+Note: the legacy setting `opendistro.ppl.query.memory_limit` is deprecated; it will fall back to the new setting if you request an update with the legacy name.
+
+## plugins.query.size_limit
+
+### Description
+
+This setting configures the maximum number of rows fetched from PPL execution results. The default value is 10000.
+
+### Example
+
+Change the size_limit to 1000
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"persistent" : {"plugins.query.size_limit" : "1000"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {
+    "plugins": {
+      "query": {
+        "size_limit": "1000"
+      }
+    }
+  },
+  "transient": {}
+}
```
+
+Note: the legacy setting `opendistro.query.size_limit` is deprecated; it will fall back to the new setting if you request an update with the legacy name.
+
+## plugins.query.buckets
+
+### Version
+
+3.4.0
+
+### Description
+
+This configuration indicates how many aggregation buckets are returned in a single response. The default value equals `plugins.query.size_limit`.
+You can change the value to any value not greater than the maximum number of aggregation buckets allowed in a single response (`search.max_buckets`). Here is an example
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' -X PUT localhost:9200/_plugins/_query/settings -d '{
+  "transient" : {
+    "plugins.query.buckets" : 1000
+  }
+}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {
+    "plugins": {
+      "query": {
+        "buckets": "1000"
+      }
+    }
+  }
+}
+```
+
+### Limitations
+
+The number of aggregation buckets is fixed at `1000` in v2. `plugins.query.buckets` only affects the number of aggregation buckets when Calcite is enabled.
+
+## plugins.calcite.all_join_types.allowed
+
+### Description
+
+Since 3.3.0, the join types `inner`, `left`, `outer` (alias of `left`), `semi`, and `anti` are supported by default. `right`, `full`, and `cross` are performance-sensitive join types that are disabled by default. Set `plugins.calcite.all_join_types.allowed = true` to enable them.
+
+### Example
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"transient" : {"plugins.calcite.all_join_types.allowed" : "true"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {
+    "plugins": {
+      "calcite": {
+        "all_join_types": {
+          "allowed": "true"
+        }
+      }
+    }
+  }
+}
+```
+
+## plugins.ppl.syntax.legacy.preferred
+
+### Description
+
+This configuration, introduced in 3.3.0, switches some behaviours in PPL syntax. The current default value is `true`.
+The behaviours it controls include:
+
+- The default value of the argument `bucket_nullable` in the `stats` command. Check [stats command](../cmd/stats.md) for details.
+- The return value of the `divide` and `/` operators. Check [expressions](../functions/expressions.md) for details.
+- The default value of the argument `usenull` in the `top` and `rare` commands. Check [top command](../cmd/top.md) and [rare command](../cmd/rare.md) for details.
+
+### Example 1
+
+You can update the setting with a new value like this.
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"transient" : {"plugins.ppl.syntax.legacy.preferred" : "false"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {
+    "plugins": {
+      "ppl": {
+        "syntax": {
+          "legacy": {
+            "preferred": "false"
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+### Example 2
+
+Reset to the default (true) by setting the value to null:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"transient" : {"plugins.ppl.syntax.legacy.preferred" : null}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {}
+}
+```
+
+## plugins.ppl.values.max.limit
+
+### Description
+
+This setting controls the maximum number of unique values that the `VALUES` aggregation function can return. When set to 0 (the default), there is no limit on the number of unique values returned. When set to a positive integer, the function returns at most that many unique values.
+
+1. The default value is 0 (unlimited).
+2. This setting is node scope.
+3. This setting can be updated dynamically.
+
+The `VALUES` function collects all unique values from a field and returns them in lexicographical order. This setting helps manage memory usage by limiting the number of values collected.
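+
+As a sketch of what this setting caps (assuming an `accounts` index with a `state` field), a query like the following returns at most `plugins.ppl.values.max.limit` distinct values:
+
+```ppl ignore
+source=accounts | stats values(state)
+```
+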
+### Example 1
+
+Set the limit to 1000 unique values:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"transient" : {"plugins.ppl.values.max.limit" : "1000"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {
+    "plugins": {
+      "ppl": {
+        "values": {
+          "max": {
+            "limit": "1000"
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+### Example 2
+
+Set to 0 explicitly for unlimited values:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"transient" : {"plugins.ppl.values.max.limit" : "0"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {},
+  "transient": {
+    "plugins": {
+      "ppl": {
+        "values": {
+          "max": {
+            "limit": "0"
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+## plugins.ppl.subsearch.maxout
+
+### Description
+
+This setting configures the maximum number of rows returned from a subsearch. The default value is `10000`. A value of `0` indicates that the restriction is unlimited.
+
+### Version
+
+3.4.0
+
+### Example
+
+Change `subsearch.maxout` to unlimited
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"persistent" : {"plugins.ppl.subsearch.maxout" : "0"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {
+    "plugins": {
+      "ppl": {
+        "subsearch": {
+          "maxout": "0"
+        }
+      }
+    }
+  },
+  "transient": {}
+}
+```
+
+## plugins.ppl.join.subsearch_maxout
+
+### Description
+
+This setting configures the maximum number of rows from the subsearch side to join against. It affects the `join` command. The default value is `50000`. A value of `0` indicates that the restriction is unlimited.
+
+### Version
+
+3.4.0
+
+### Example
+
+Change `join.subsearch_maxout` to 5000
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X PUT localhost:9200/_plugins/_query/settings \
+-d '{"persistent" : {"plugins.ppl.join.subsearch_maxout" : "5000"}}'
+```
+
+Expected output:
+
+```json
+{
+  "acknowledged": true,
+  "persistent": {
+    "plugins": {
+      "ppl": {
+        "join": {
+          "subsearch_maxout": "5000"
+        }
+      }
+    }
+  },
+  "transient": {}
+}
+```
+
\ No newline at end of file
diff --git a/docs/user/ppl/admin/settings.rst b/docs/user/ppl/admin/settings.rst
deleted file mode 100644
index ef9eba207fa..00000000000
--- a/docs/user/ppl/admin/settings.rst
+++ /dev/null
@@ -1,427 +0,0 @@
-.. highlight:: sh
-
-============
-PPL Settings
-============
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-
-Introduction
-============
-
-When OpenSearch bootstraps, PPL plugin will register a few settings in OpenSearch cluster settings. Most of the settings are able to change dynamically so you can control the behavior of PPL plugin without need to bounce your cluster.
-
-plugins.ppl.enabled
-===================
-
-Description
------------
-
-You can disable SQL plugin to reject all coming requests.
-
-1. The default value is true.
-2. This setting is node scope.
-3. This setting can be updated dynamically.
-
-Notes. Calls to _plugins/_ppl include index names in the request body, so they have the same access policy considerations as the bulk, mget, and msearch operations. if rest.action.multi.allow_explicit_index set to false, PPL plugin will be disabled.
-
-Example 1
----------
-
-You can update the setting with a new value like this.
-
-PPL query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"transient" : {"plugins.ppl.enabled" : "false"}}' - { - "acknowledged": true, - "persistent": {}, - "transient": { - "plugins": { - "ppl": { - "enabled": "false" - } - } - } - } - -Note: the legacy settings of ``opendistro.ppl.enabled`` is deprecated, it will fallback to the new settings if you request an update with the legacy name. - -Example 2 ---------- - -Query result after the setting updated is like: - -PPL query:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X POST localhost:9200/_plugins/_ppl \ - ... -d '{"query": "source=my_prometheus"}' - { - "error": { - "reason": "Invalid Query", - "details": "Either plugins.ppl.enabled or rest.action.multi.allow_explicit_index setting is false", - "type": "IllegalAccessException" - }, - "status": 400 - } - -plugins.ppl.query.timeout -========================= - -Description ------------ - -This setting controls the maximum execution time for PPL queries. When a query exceeds this timeout, it will be interrupted and return a timeout error. - -1. The default value is 300s (5 minutes). -2. This setting is node scope. -3. This setting can be updated dynamically. - -Example -------- - -You can configure the query timeout: - -PPL query:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"transient" : {"plugins.ppl.query.timeout" : "60s"}}' - { - "acknowledged": true, - "persistent": {}, - "transient": { - "plugins": { - "ppl": { - "query": { - "timeout": "60s" - } - } - } - } - } - -plugins.query.memory_limit -========================== - -Description ------------ - -You can set heap memory usage limit for the query engine. When query running, it will detected whether the heap memory usage under the limit, if not, it will terminated the current query. The default value is: 85% - -Example -------- - -PPL query:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"persistent" : {"plugins.query.memory_limit" : "80%"}}' - { - "acknowledged": true, - "persistent": { - "plugins": { - "query": { - "memory_limit": "80%" - } - } - }, - "transient": {} - } - -Note: the legacy settings of ``opendistro.ppl.query.memory_limit`` is deprecated, it will fallback to the new settings if you request an update with the legacy name. - -plugins.query.size_limit -======================== - -Description ------------ - -The size configures the maximum amount of rows to be fetched from PPL execution results. The default value is: 10000 - -Example -------- - -Change the size_limit to 1000:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"persistent" : {"plugins.query.size_limit" : "1000"}}' - { - "acknowledged": true, - "persistent": { - "plugins": { - "query": { - "size_limit": "1000" - } - } - }, - "transient": {} - } - -Note: the legacy settings of ``opendistro.query.size_limit`` is deprecated, it will fallback to the new settings if you request an update with the legacy name. - -plugins.query.buckets -===================== - -Version -------- -3.4.0 - -Description ------------ - -This configuration indicates how many aggregation buckets will return in a single response. The default value equals to ``plugins.query.size_limit``. 
-You can change the value to any value not greater than the maximum number of aggregation buckets allowed in a single response (`search.max_buckets`), here is an example:: - - >> curl -H 'Content-Type: application/json' -X PUT localhost:9200/_plugins/_query/settings -d '{ - "transient" : { - "plugins.query.buckets" : 1000 - } - }' - -Result set:: - - { - "acknowledged" : true, - "persistent" : { }, - "transient" : { - "plugins" : { - "query" : { - "buckets" : "1000" - } - } - } - } - -Limitations ------------ -The number of aggregation buckets is fixed to ``1000`` in v2. ``plugins.query.buckets`` can only effect the number of aggregation buckets when calcite enabled. - -plugins.calcite.all_join_types.allowed -====================================== - -Description ------------ - -Since 3.3.0, join types ``inner``, ``left``, ``outer`` (alias of ``left``), ``semi`` and ``anti`` are supported by default. ``right``, ``full``, ``cross`` are performance sensitive join types which are disabled by default. Set config ``plugins.calcite.all_join_types.allowed = true`` to enable. - -Example -------- - -PPL query:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"transient" : {"plugins.calcite.all_join_types.allowed" : "true"}}' - { - "acknowledged": true, - "persistent": {}, - "transient": { - "plugins": { - "calcite": { - "all_join_types": { - "allowed": "true" - } - } - } - } - } - -plugins.ppl.syntax.legacy.preferred -=================================== - -Description ------------ - -This configuration is introduced since 3.3.0 which is used to switch some behaviours in PPL syntax. The current default value is ``true``. -The behaviours it controlled includes: - -- The default value of argument ``bucket_nullable`` in ``stats`` command. Check `stats command <../cmd/stats.rst>`_ for details. -- The return value of ``divide`` and ``/`` operator. Check `expressions <../functions/expressions.rst>`_ for details. -- The default value of argument ``usenull`` in ``top`` and ``rare`` commands. Check `top command <../cmd/top.rst>`_ and `rare command <../cmd/rare.rst>`_ for details. - -Example 1 -------- - -You can update the setting with a new value like this. - -PPL query:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"transient" : {"plugins.ppl.syntax.legacy.preferred" : "false"}}' - { - "acknowledged": true, - "persistent": {}, - "transient": { - "plugins": { - "ppl": { - "syntax": { - "legacy": { - "preferred": "false" - } - } - } - } - } - } - -Example 2 ---------- - -Reset to default (true) by setting to null: - -PPL query:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"transient" : {"plugins.ppl.syntax.legacy.preferred" : null}}' - { - "acknowledged": true, - "persistent": {}, - "transient": {} - } - -plugins.ppl.values.max.limit -============================ - -Description ------------ - -This setting controls the maximum number of unique values that the ``VALUES`` aggregation function can return. When set to 0 (the default), there is no limit on the number of unique values returned. When set to a positive integer, the function will return at most that many unique values. - -1. The default value is 0 (unlimited). -2. This setting is node scope. -3. This setting can be updated dynamically. 
-
-The ``VALUES`` function collects all unique values from a field and returns them in lexicographical order. This setting helps manage memory usage by limiting the number of values collected.
-
-Example 1
----------
-
-Set the limit to 1000 unique values:
-
-PPL query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X PUT localhost:9200/_plugins/_query/settings \
-    ... -d '{"transient" : {"plugins.ppl.values.max.limit" : "1000"}}'
-    {
-      "acknowledged": true,
-      "persistent": {},
-      "transient": {
-        "plugins": {
-          "ppl": {
-            "values": {
-              "max": {
-                "limit": "1000"
-              }
-            }
-          }
-        }
-      }
-    }
-
-Example 2
---------- 
-
-Set to 0 explicitly for unlimited values:
-
-PPL query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X PUT localhost:9200/_plugins/_query/settings \
-    ... -d '{"transient" : {"plugins.ppl.values.max.limit" : "0"}}'
-    {
-      "acknowledged": true,
-      "persistent": {},
-      "transient": {
-        "plugins": {
-          "ppl": {
-            "values": {
-              "max": {
-                "limit": "0"
-              }
-            }
-          }
-        }
-      }
-    }
-
-
-plugins.ppl.subsearch.maxout
-============================
-
-Description
------------
-
-The size configures the maximum of rows to return from subsearch. The default value is: ``10000``. A value of ``0`` indicates that the restriction is unlimited.
-
-Version
--------
-3.4.0
-
-Example
--------
-
-Change the subsearch.maxout to unlimited::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X PUT localhost:9200/_plugins/_query/settings \
-    ... -d '{"persistent" : {"plugins.ppl.subsearch.maxout" : "0"}}'
-    {
-      "acknowledged": true,
-      "persistent": {
-        "plugins": {
-          "ppl": {
-            "subsearch": {
-              "maxout": "0"
-            }
-          }
-        }
-      },
-      "transient": {}
-    }
-
-plugins.ppl.join.subsearch_maxout
-=================================
-
-Description
------------
-
-The size configures the maximum of rows from subsearch to join against. This configuration impacts ``join`` command. The default value is: ``50000``. A value of ``0`` indicates that the restriction is unlimited.
-
-Version
--------
-3.4.0
-
-Example
--------
-
-Change the join.subsearch_maxout to 5000::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X PUT localhost:9200/_plugins/_query/settings \
-    ... -d '{"persistent" : {"plugins.ppl.join.subsearch_maxout" : "5000"}}'
-    {
-      "acknowledged": true,
-      "persistent": {
-        "plugins": {
-          "ppl": {
-            "join": {
-              "subsearch_maxout": "5000"
-            }
-          }
-        }
-      },
-      "transient": {}
-    }
diff --git a/docs/user/ppl/cmd/ad.md b/docs/user/ppl/cmd/ad.md
new file mode 100644
index 00000000000..6d18396506b
--- /dev/null
+++ b/docs/user/ppl/cmd/ad.md
@@ -0,0 +1,124 @@
+# ad (deprecated by ml command)
+
+## Description
+
+The `ad` command applies the Random Cut Forest (RCF) algorithm in the ml-commons plugin to the search result returned by a PPL command. Based on the input, the command uses two types of RCF algorithms: fixed-in-time RCF for processing time-series data and batch RCF for processing non-time-series data.
+
+## Syntax
+
+### Fixed In Time RCF For Time-series Data
+
+ad [number_of_trees] [shingle_size] [sample_size] [output_after] [time_decay] [anomaly_rate] [date_format] [time_zone] [category_field]
+
+* number_of_trees: optional. Number of trees in the forest. **Default:** 30.
+* shingle_size: optional. A shingle is a consecutive sequence of the most recent records. **Default:** 8.
+* sample_size: optional. The sample size used by stream samplers in this forest. **Default:** 256.
+* output_after: optional. The number of points required by stream samplers before results are returned.
 **Default:** 32.
+* time_decay: optional. The decay factor used by stream samplers in this forest. **Default:** 0.0001.
+* anomaly_rate: optional. The anomaly rate. **Default:** 0.005.
+* time_field: mandatory. Specifies the time field for RCF to use as time-series data.
+* date_format: optional. Used for formatting time_field. **Default:** "yyyy-MM-dd HH:mm:ss".
+* time_zone: optional. Used for setting time zone for time_field. **Default:** "UTC".
+* category_field: optional. Specifies the category field used to group inputs. Each category will be independently predicted.
+
+### Batch RCF For Non-time-series Data
+
+ad [number_of_trees] [sample_size] [output_after] [training_data_size] [anomaly_score_threshold] [category_field]
+
+* number_of_trees: optional. Number of trees in the forest. **Default:** 30.
+* sample_size: optional. Number of random samples given to each tree from the training data set. **Default:** 256.
+* output_after: optional. The number of points required by stream samplers before results are returned. **Default:** 32.
+* training_data_size: optional. **Default:** size of your training data set.
+* anomaly_score_threshold: optional. The threshold of anomaly score. **Default:** 1.0.
+* category_field: optional. Specifies the category field used to group inputs. Each category will be independently predicted.
+
+## Example 1: Detecting events in New York City from taxi ridership data with time-series data
+
+This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data.
+
+```ppl ignore
+source=nyc_taxi
+| fields value, timestamp
+| AD time_field='timestamp'
+| where value=10844.0
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+---------------------+-------+---------------+
+| value   | timestamp           | score | anomaly_grade |
+|---------+---------------------+-------+---------------|
+| 10844.0 | 2014-07-01 00:00:00 | 0.0   | 0.0           |
++---------+---------------------+-------+---------------+
+```
+
+## Example 2: Detecting events in New York City from taxi ridership data with time-series data independently with each category
+
+This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values.
+
+```ppl ignore
+source=nyc_taxi
+| fields category, value, timestamp
+| AD time_field='timestamp' category_field='category'
+| where value=10844.0 or value=6526.0
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------+---------+---------------------+-------+---------------+
+| category | value   | timestamp           | score | anomaly_grade |
+|----------+---------+---------------------+-------+---------------|
+| night    | 10844.0 | 2014-07-01 00:00:00 | 0.0   | 0.0           |
+| day      | 6526.0  | 2014-07-01 06:00:00 | 0.0   | 0.0           |
++----------+---------+---------------------+-------+---------------+
+```
+
+## Example 3: Detecting events in New York City from taxi ridership data with non-time-series data
+
+This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data.
+ +```ppl ignore +source=nyc_taxi +| fields value +| AD +| where value=10844.0 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+-------+-----------+ +| value | score | anomalous | +|---------+-------+-----------| +| 10844.0 | 0.0 | False | ++---------+-------+-----------+ +``` + +## Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently with each category + +This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values. + +```ppl ignore +source=nyc_taxi +| fields category, value +| AD category_field='category' +| where value=10844.0 or value=6526.0 +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------+---------+-------+-----------+ +| category | value | score | anomalous | +|----------+---------+-------+-----------| +| night | 10844.0 | 0.0 | False | +| day | 6526.0 | 0.0 | False | ++----------+---------+-------+-----------+ +``` + +## Limitations + +The `ad` command can only work with `plugins.calcite.enabled=false`. \ No newline at end of file diff --git a/docs/user/ppl/cmd/ad.rst b/docs/user/ppl/cmd/ad.rst deleted file mode 100644 index 26502dea682..00000000000 --- a/docs/user/ppl/cmd/ad.rst +++ /dev/null @@ -1,112 +0,0 @@ -============================= -ad (deprecated by ml command) -============================= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``ad`` command applies Random Cut Forest (RCF) algorithm in the ml-commons plugin on the search result returned by a PPL command. Based on the input, the command uses two types of RCF algorithms: fixed in time RCF for processing time-series data, batch RCF for processing non-time-series data. - - -Syntax -====== - -Fixed In Time RCF For Time-series Data --------------------------------------- -ad [number_of_trees] [shingle_size] [sample_size] [output_after] [time_decay] [anomaly_rate] [date_format] [time_zone] [category_field] - -* number_of_trees: optional. Number of trees in the forest. **Default:** 30. -* shingle_size: optional. A shingle is a consecutive sequence of the most recent records. **Default:** 8. -* sample_size: optional. The sample size used by stream samplers in this forest. **Default:** 256. -* output_after: optional. The number of points required by stream samplers before results are returned. **Default:** 32. -* time_decay: optional. The decay factor used by stream samplers in this forest. **Default:** 0.0001. -* anomaly_rate: optional. The anomaly rate. **Default:** 0.005. -* time_field: mandatory. Specifies the time field for RCF to use as time-series data. -* date_format: optional. Used for formatting time_field. **Default:** "yyyy-MM-dd HH:mm:ss". -* time_zone: optional. Used for setting time zone for time_field. **Default:** "UTC". -* category_field: optional. Specifies the category field used to group inputs. Each category will be independently predicted. - -Batch RCF For Non-time-series Data ----------------------------------- -ad [number_of_trees] [sample_size] [output_after] [training_data_size] [anomaly_score_threshold] [category_field] - -* number_of_trees: optional. Number of trees in the forest. **Default:** 30. -* sample_size: optional. Number of random samples given to each tree from the training data set. **Default:** 256. -* output_after: optional. The number of points required by stream samplers before results are returned. **Default:** 32. 
-* training_data_size: optional. **Default:** size of your training data set. -* anomaly_score_threshold: optional. The threshold of anomaly score. **Default:** 1.0. -* category_field: optional. Specifies the category field used to group inputs. Each category will be independently predicted. - -Example 1: Detecting events in New York City from taxi ridership data with time-series data -=========================================================================================== - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data. - -PPL query:: - - > source=nyc_taxi | fields value, timestamp | AD time_field='timestamp' | where value=10844.0 - fetched rows / total rows = 1/1 - +---------+---------------------+-------+---------------+ - | value | timestamp | score | anomaly_grade | - |---------+---------------------+-------+---------------| - | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | - +---------+---------------------+-------+---------------+ - -Example 2: Detecting events in New York City from taxi ridership data with time-series data independently with each category -============================================================================================================================ - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values. - -PPL query:: - - > source=nyc_taxi | fields category, value, timestamp | AD time_field='timestamp' category_field='category' | where value=10844.0 or value=6526.0 - fetched rows / total rows = 2/2 - +----------+---------+---------------------+-------+---------------+ - | category | value | timestamp | score | anomaly_grade | - |----------+---------+---------------------+-------+---------------| - | night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | - | day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 | - +----------+---------+---------------------+-------+---------------+ - - -Example 3: Detecting events in New York City from taxi ridership data with non-time-series data -=============================================================================================== - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data. - -PPL query:: - - > source=nyc_taxi | fields value | AD | where value=10844.0 - fetched rows / total rows = 1/1 - +---------+-------+-----------+ - | value | score | anomalous | - |---------+-------+-----------| - | 10844.0 | 0.0 | False | - +---------+-------+-----------+ - -Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently with each category -================================================================================================================================ - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values. - -PPL query:: - - > source=nyc_taxi | fields category, value | AD category_field='category' | where value=10844.0 or value=6526.0 - fetched rows / total rows = 2/2 - +----------+---------+-------+-----------+ - | category | value | score | anomalous | - |----------+---------+-------+-----------| - | night | 10844.0 | 0.0 | False | - | day | 6526.0 | 0.0 | False | - +----------+---------+-------+-----------+ - - -Limitations -=========== -The ``ad`` command can only work with ``plugins.calcite.enabled=false``. 
diff --git a/docs/user/ppl/cmd/append.md b/docs/user/ppl/cmd/append.md
new file mode 100644
index 00000000000..6c765286c69
--- /dev/null
+++ b/docs/user/ppl/cmd/append.md
@@ -0,0 +1,63 @@
+# append
+
+## Description
+
+The `append` command appends the result of a sub-search and attaches it as additional rows to the bottom of the input search results (the main search).
+The command aligns columns with the same field names and types. For different column fields between the main search and sub-search, NULL values are filled in the respective rows.
+
+## Syntax
+
+append \<sub-search\>
+
+* sub-search: mandatory. Executes PPL commands as a secondary search.
+
+## Limitations
+
+* **Schema Compatibility**: When fields with the same name exist between the main search and sub-search but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with `eval` or using `fields` to select non-conflicting columns).
+
+## Example 1: Append rows from a count aggregation to existing search result
+
+This example appends rows from "count by gender" to "sum by gender, state".
+
+```ppl
+source=accounts | stats sum(age) by gender, state | sort -`sum(age)` | head 5 | append [ source=accounts | stats count(age) by gender ]
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 6/6
++----------+--------+-------+------------+
+| sum(age) | gender | state | count(age) |
+|----------+--------+-------+------------|
+| 36       | M      | TN    | null       |
+| 33       | M      | MD    | null       |
+| 32       | M      | IL    | null       |
+| 28       | F      | VA    | null       |
+| null     | F      | null  | 1          |
+| null     | M      | null  | 3          |
++----------+--------+-------+------------+
+```
+
+## Example 2: Append rows with merged column names
+
+This example appends rows from "sum by gender" to "sum by gender, state", merging columns with the same field name and type.
+
+```ppl
+source=accounts | stats sum(age) as sum by gender, state | sort -sum | head 5 | append [ source=accounts | stats sum(age) as sum by gender ]
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 6/6
++-----+--------+-------+
+| sum | gender | state |
+|-----+--------+-------|
+| 36  | M      | TN    |
+| 33  | M      | MD    |
+| 32  | M      | IL    |
+| 28  | F      | VA    |
+| 28  | F      | null  |
+| 101 | M      | null  |
++-----+--------+-------+
+```
+ 
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/append.rst b/docs/user/ppl/cmd/append.rst
deleted file mode 100644
index 6afdda6e439..00000000000
--- a/docs/user/ppl/cmd/append.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-======
-append
-======
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``append`` command appends the result of a sub-search and attaches it as additional rows to the bottom of the input search results (The main search).
-| The command aligns columns with the same field names and types. For different column fields between the main search and sub-search, NULL values are filled in the respective rows.
-
-Syntax
-======
-append <sub-search>
-
-* sub-search: mandatory. Executes PPL commands as a secondary search.
-
-Limitations
-===========
-
-* **Schema Compatibility**: When fields with the same name exist between the main search and sub-search but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with ``eval`` or using ``fields`` to select non-conflicting columns).
-
-Example 1: Append rows from a count aggregation to existing search result
-=========================================================================
-
-This example appends rows from "count by gender" to "sum by gender, state".
-
-PPL query::
-
-    os> source=accounts | stats sum(age) by gender, state | sort -`sum(age)` | head 5 | append [ source=accounts | stats count(age) by gender ];
-    fetched rows / total rows = 6/6
-    +----------+--------+-------+------------+
-    | sum(age) | gender | state | count(age) |
-    |----------+--------+-------+------------|
-    | 36       | M      | TN    | null       |
-    | 33       | M      | MD    | null       |
-    | 32       | M      | IL    | null       |
-    | 28       | F      | VA    | null       |
-    | null     | F      | null  | 1          |
-    | null     | M      | null  | 3          |
-    +----------+--------+-------+------------+
-
-Example 2: Append rows with merged column names
-===============================================
-
-This example appends rows from "sum by gender" to "sum by gender, state" with merged column of same field name and type.
-
-PPL query::
-
-    os> source=accounts | stats sum(age) as sum by gender, state | sort -sum | head 5 | append [ source=accounts | stats sum(age) as sum by gender ];
-    fetched rows / total rows = 6/6
-    +-----+--------+-------+
-    | sum | gender | state |
-    |-----+--------+-------|
-    | 36  | M      | TN    |
-    | 33  | M      | MD    |
-    | 32  | M      | IL    |
-    | 28  | F      | VA    |
-    | 28  | F      | null  |
-    | 101 | M      | null  |
-    +-----+--------+-------+
diff --git a/docs/user/ppl/cmd/appendcol.md b/docs/user/ppl/cmd/appendcol.md
new file mode 100644
index 00000000000..fb879c1b6f6
--- /dev/null
+++ b/docs/user/ppl/cmd/appendcol.md
@@ -0,0 +1,126 @@
+# appendcol
+
+## Description
+
+The `appendcol` command appends the result of a sub-search and attaches it alongside the input search results (the main search).
+
+## Syntax
+
+appendcol [override=\<boolean\>] \<sub-search\>
+
+* override=\<boolean\>: optional. Boolean flag that specifies whether the result from the main search should be overwritten in the case of a column name conflict. **Default:** false.
+* sub-search: mandatory. Executes PPL commands as a secondary search. The sub-search uses the same data specified in the source clause of the main search results as its input.
+
+## Example 1: Append a count aggregation to existing search result
+
+This example appends "count by gender" to "sum by gender, state".
+
+```ppl
+source=accounts
+| stats sum(age) by gender, state
+| appendcol [ stats count(age) by gender ]
+| head 10
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 10/10
++--------+-------+----------+------------+
+| gender | state | sum(age) | count(age) |
+|--------+-------+----------+------------|
+| F      | AK    | 317      | 493        |
+| F      | AL    | 397      | 507        |
+| F      | AR    | 229      | NULL       |
+| F      | AZ    | 238      | NULL       |
+| F      | CA    | 282      | NULL       |
+| F      | CO    | 217      | NULL       |
+| F      | CT    | 147      | NULL       |
+| F      | DC    | 358      | NULL       |
+| F      | DE    | 101      | NULL       |
+| F      | FL    | 310      | NULL       |
++--------+-------+----------+------------+
+```
+
+## Example 2: Append a count aggregation to existing search result with override option
+
+This example appends "count by gender" to "sum by gender, state" with the override option.
+ +```ppl +source=accounts +| stats sum(age) by gender, state +| appendcol override=true [ stats count(age) by gender ] +| head 10 +``` + +Expected output: + +```text +fetched rows / total rows = 10/10 ++--------+-------+----------+------------+ +| gender | state | sum(age) | count(age) | +|--------+-------+----------+------------| +| F | AK | 317 | 493 | +| M | AL | 397 | 507 | +| F | AR | 229 | NULL | +| F | AZ | 238 | NULL | +| F | CA | 282 | NULL | +| F | CO | 217 | NULL | +| F | CT | 147 | NULL | +| F | DC | 358 | NULL | +| F | DE | 101 | NULL | +| F | FL | 310 | NULL | ++--------+-------+----------+------------+ +``` + +## Example 3: Append multiple sub-search results + +This example shows how to chain multiple appendcol commands to add columns from different sub-searches. + +```ppl +source=employees +| fields name, dept, age +| appendcol [ stats avg(age) as avg_age ] +| appendcol [ stats max(age) as max_age ] +``` + +Expected output: + +```text +fetched rows / total rows = 9/9 ++------+-------------+-----+------------------+---------+ +| name | dept | age | avg_age | max_age | +|------+-------------+-----+------------------+---------| +| Lisa | Sales | 35 | 31.2222222222222 | 38 | +| Fred | Engineering | 28 | NULL | NULL | +| Paul | Engineering | 23 | NULL | NULL | +| Evan | Sales | 38 | NULL | NULL | +| Chloe| Engineering | 25 | NULL | NULL | +| Tom | Engineering | 33 | NULL | NULL | +| Alex | Sales | 33 | NULL | NULL | +| Jane | Marketing | 28 | NULL | NULL | +| Jeff | Marketing | 38 | NULL | NULL | ++------+-------------+-----+------------------+---------+ +``` + +## Example 4: Override case of column name conflict + +This example demonstrates the override option when column names conflict between main search and sub-search. + +```ppl +source=employees +| stats avg(age) as agg by dept +| appendcol override=true [ stats max(age) as agg by dept ] +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----+-------------+ +| agg | dept | +|-----+-------------| +| 38 | Sales | +| 38 | Engineering | +| 38 | Marketing | ++-----+-------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/appendcol.rst b/docs/user/ppl/cmd/appendcol.rst deleted file mode 100644 index a9cb714256b..00000000000 --- a/docs/user/ppl/cmd/appendcol.rst +++ /dev/null @@ -1,110 +0,0 @@ -========= -appendcol -========= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -============ -The ``appendcol`` command appends the result of a sub-search and attaches it alongside with the input search results (The main search). - -Syntax -====== -appendcol [override=] - -* override=: optional. Boolean field to specify should result from main-result be overwritten in the case of column name conflict. **Default:** false. -* sub-search: mandatory. Executes PPL commands as a secondary search. The sub-search uses the same data specified in the source clause of the main search results as its input. - -Example 1: Append a count aggregation to existing search result -=============================================================== - -This example appends "count by gender" to "sum by gender, state". 
-
-PPL query::
-
-    PPL> source=accounts | stats sum(age) by gender, state | appendcol [ stats count(age) by gender ] | head 10;
-    fetched rows / total rows = 10/10
-    +--------+-------+----------+------------+
-    | gender | state | sum(age) | count(age) |
-    |--------+-------+----------+------------|
-    | F      | AK    | 317      | 493        |
-    | F      | AL    | 397      | 507        |
-    | F      | AR    | 229      | NULL       |
-    | F      | AZ    | 238      | NULL       |
-    | F      | CA    | 282      | NULL       |
-    | F      | CO    | 217      | NULL       |
-    | F      | CT    | 147      | NULL       |
-    | F      | DC    | 358      | NULL       |
-    | F      | DE    | 101      | NULL       |
-    | F      | FL    | 310      | NULL       |
-    +--------+-------+----------+------------+
-
-Example 2: Append a count aggregation to existing search result with override option
-====================================================================================
-
-This example appends "count by gender" to "sum by gender, state" with override option.
-
-PPL query::
-
-    PPL> source=accounts | stats sum(age) by gender, state | appendcol override=true [ stats count(age) by gender ] | head 10;
-    fetched rows / total rows = 10/10
-    +--------+-------+----------+------------+
-    | gender | state | sum(age) | count(age) |
-    |--------+-------+----------+------------|
-    | F      | AK    | 317      | 493        |
-    | M      | AL    | 397      | 507        |
-    | F      | AR    | 229      | NULL       |
-    | F      | AZ    | 238      | NULL       |
-    | F      | CA    | 282      | NULL       |
-    | F      | CO    | 217      | NULL       |
-    | F      | CT    | 147      | NULL       |
-    | F      | DC    | 358      | NULL       |
-    | F      | DE    | 101      | NULL       |
-    | F      | FL    | 310      | NULL       |
-    +--------+-------+----------+------------+
-
-Example 3: Append multiple sub-search results
-=============================================
-
-This example shows how to chain multiple appendcol commands to add columns from different sub-searches.
-
-PPL query::
-
-    PPL> source=employees | fields name, dept, age | appendcol [ stats avg(age) as avg_age ] | appendcol [ stats max(age) as max_age ];
-    fetched rows / total rows = 9/9
-    +------+-------------+-----+------------------+---------+
-    | name | dept        | age | avg_age          | max_age |
-    |------+-------------+-----+------------------+---------|
-    | Lisa | Sales       | 35  | 31.2222222222222 | 38      |
-    | Fred | Engineering | 28  | NULL             | NULL    |
-    | Paul | Engineering | 23  | NULL             | NULL    |
-    | Evan | Sales       | 38  | NULL             | NULL    |
-    | Chloe| Engineering | 25  | NULL             | NULL    |
-    | Tom  | Engineering | 33  | NULL             | NULL    |
-    | Alex | Sales       | 33  | NULL             | NULL    |
-    | Jane | Marketing   | 28  | NULL             | NULL    |
-    | Jeff | Marketing   | 38  | NULL             | NULL    |
-    +------+-------------+-----+------------------+---------+
-
-Example 4: Override case of column name conflict
-================================================
-
-This example demonstrates the override option when column names conflict between main search and sub-search.
-
-PPL query::
-
-    PPL> source=employees | stats avg(age) as agg by dept | appendcol override=true [ stats max(age) as agg by dept ];
-    fetched rows / total rows = 3/3
-    +-----+-------------+
-    | agg | dept        |
-    |-----+-------------|
-    | 38  | Sales       |
-    | 38  | Engineering |
-    | 38  | Marketing   |
-    +-----+-------------+
-
diff --git a/docs/user/ppl/cmd/appendpipe.md b/docs/user/ppl/cmd/appendpipe.md
new file mode 100644
index 00000000000..f2dc71a2abc
--- /dev/null
+++ b/docs/user/ppl/cmd/appendpipe.md
@@ -0,0 +1,70 @@
+# appendpipe
+
+## Description
+
+The `appendpipe` command appends the result of the subpipeline to the search results. Unlike a subsearch, the subpipeline is not run first. The subpipeline is run when the search reaches the appendpipe command.
+The command aligns columns with the same field names and types. 
For different column fields between the main search and sub-search, NULL values are filled in the respective rows.
+
+## Syntax
+
+appendpipe [\<subpipeline\>]
+
+* subpipeline: mandatory. A list of commands that are applied to the search results from the commands that occur in the search before the `appendpipe` command.
+
+## Example 1: Append rows from a total count to existing search result
+
+This example appends rows from "total by gender" to "sum by gender, state". Because the sub-search uses the new column name `total`, its values are attached as a separate column.
+
+```ppl
+source=accounts
+| stats sum(age) as part by gender, state
+| sort -part
+| head 5
+| appendpipe [ stats sum(part) as total by gender ]
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 6/6
++------+--------+-------+-------+
+| part | gender | state | total |
+|------+--------+-------+-------|
+| 36   | M      | TN    | null  |
+| 33   | M      | MD    | null  |
+| 32   | M      | IL    | null  |
+| 28   | F      | VA    | null  |
+| null | F      | null  | 28    |
+| null | M      | null  | 101   |
++------+--------+-------+-------+
```
+
+## Example 2: Append rows with merged column names
+
+This example appends rows from "sum(total) by gender" to "sum by gender, state", merging columns with the same field name and type into the existing `total` column.
+
+```ppl
+source=accounts
+| stats sum(age) as total by gender, state
+| sort -total
+| head 5
+| appendpipe [ stats sum(total) as total by gender ]
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 6/6
++----------+--------+-------+
+| total    | gender | state |
+|----------+--------+-------|
+| 36       | M      | TN    |
+| 33       | M      | MD    |
+| 32       | M      | IL    |
+| 28       | F      | VA    |
+| 28       | F      | null  |
+| 101      | M      | null  |
++----------+--------+-------+
+```
+
+## Limitations
+
+* **Schema Compatibility**: Same as the `append` command: when fields with the same name exist between the main search and sub-search but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with `eval` or using `fields` to select non-conflicting columns).
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/appendpipe.rst b/docs/user/ppl/cmd/appendpipe.rst
deleted file mode 100644
index c309517724a..00000000000
--- a/docs/user/ppl/cmd/appendpipe.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-=========
-appendpipe
-=========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-============
-| The ``appendpipe`` command appends the result of the subpipeline to the search results. Unlike a subsearch, the subpipeline is not run first.The subpipeline is run when the search reaches the appendpipe command.
-The command aligns columns with the same field names and types. For different column fields between the main search and sub-search, NULL values are filled in the respective rows.
-
-Syntax
-============
-appendpipe [<subpipeline>]
-
-* subpipeline: mandatory. A list of commands that are applied to the search results from the commands that occur in the search before the ``appendpipe`` command.
-
-Example 1: Append rows from a total count to existing search result
-====================================================================================
-
-This example appends rows from "total by gender" to "sum by gender, state" with merged column of same field name and type. 
-
-PPL query::
-
-    os> source=accounts | stats sum(age) as part by gender, state | sort -part | head 5 | appendpipe [ stats sum(part) as total by gender ];
-    fetched rows / total rows = 6/6
-    +------+--------+-------+-------+
-    | part | gender | state | total |
-    |------+--------+-------+-------|
-    | 36   | M      | TN    | null  |
-    | 33   | M      | MD    | null  |
-    | 32   | M      | IL    | null  |
-    | 28   | F      | VA    | null  |
-    | null | F      | null  | 28    |
-    | null | M      | null  | 101   |
-    +------+--------+-------+-------+
-
-
-
-Example 2: Append rows with merged column names
-===============================================================
-
-This example appends rows from "count by gender" to "sum by gender, state".
-
-PPL query::
-
-    os> source=accounts | stats sum(age) as total by gender, state | sort -total | head 5 | appendpipe [ stats sum(total) as total by gender ];
-    fetched rows / total rows = 6/6
-    +----------+--------+-------+
-    | total    | gender | state |
-    |----------+--------+-------|
-    | 36       | M      | TN    |
-    | 33       | M      | MD    |
-    | 32       | M      | IL    |
-    | 28       | F      | VA    |
-    | 28       | F      | null  |
-    | 101      | M      | null  |
-    +----------+--------+-------+
-
-Limitations
-===========
-
-* **Schema Compatibility**: Same as command ``append``, when fields with the same name exist between the main search and sub-search but have incompatible types, the query will fail with an error. To avoid type conflicts, ensure that fields with the same name have the same data type, or use different field names (e.g., by renaming with ``eval`` or using ``fields`` to select non-conflicting columns).
diff --git a/docs/user/ppl/cmd/bin.md b/docs/user/ppl/cmd/bin.md
new file mode 100644
index 00000000000..4fc81d0f0fd
--- /dev/null
+++ b/docs/user/ppl/cmd/bin.md
@@ -0,0 +1,469 @@
+# bin
+
+## Description
+
+The `bin` command groups numeric values into buckets of equal intervals, making it useful for creating histograms and analyzing data distribution. It takes a numeric or time-based field and generates a new field with values that represent the lower bound of each bucket.
+
+## Syntax
+
+bin \<field\> [span=\<interval\>] [minspan=\<interval\>] [bins=\<count\>] [aligntime=(earliest \| latest \| \<timestamp\>)] [start=\<numeric\>] [end=\<numeric\>]
+
+* field: mandatory. The field to bin. Accepts numeric or time-based fields.
+* span: optional. The interval size for each bin. Cannot be used with bins or minspan parameters.
+    * Supports numeric (e.g., `1000`), logarithmic (e.g., `log10`, `2log10`), and time intervals
+    * Available time units:
+        * microsecond (us)
+        * millisecond (ms)
+        * centisecond (cs)
+        * decisecond (ds)
+        * second (s, sec, secs, second, seconds)
+        * minute (m, min, mins, minute, minutes)
+        * hour (h, hr, hrs, hour, hours)
+        * day (d, day, days)
+        * month (mon, month, months)
+* minspan: optional. The minimum interval size for automatic span calculation. Cannot be used with span or bins parameters.
+* bins: optional. The maximum number of equal-width bins to create. Cannot be used with span or minspan parameters. The bins parameter must be between 2 and 50000 (inclusive).
+* aligntime: optional. Align the bin times for time-based fields. Valid only for time-based discretization. Options:
+    * earliest: Align bins to the earliest timestamp in the data
+    * latest: Align bins to the latest timestamp in the data
+    * \<timestamp\>: Align bins to a specific epoch time value or time modifier expression
+* start: optional. The starting value for binning range. **Default:** minimum field value.
+* end: optional. The ending value for binning range. **Default:** maximum field value.
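+
+As a quick illustration of the time-unit forms above, here is a minimal sketch (using the `time_test` index that appears in the examples below) that buckets events into 15-minute bins before counting them:
+
+```ppl ignore
+source=time_test
+| bin @timestamp span=15m
+| stats count() by @timestamp
+```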
+ +**Parameter Behavior** +When multiple parameters are specified, priority order is: span > minspan > bins > start/end > default. +**Special Behaviors:** +* Logarithmic span (`log10`, `2log10`, etc.) creates logarithmic bin boundaries instead of linear +* Daily/monthly spans automatically align to calendar boundaries and return date strings (YYYY-MM-DD) instead of timestamps +* aligntime parameter only applies to time spans excluding days/months +* start/end parameters expand the range (never shrink) and affect bin width calculation + +## Example 1: Basic numeric span + +```ppl +source=accounts +| bin age span=10 +| fields age, account_number +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-------+----------------+ +| age | account_number | +|-------+----------------| +| 30-40 | 1 | +| 30-40 | 6 | +| 20-30 | 13 | ++-------+----------------+ +``` + +## Example 2: Large numeric span + +```ppl +source=accounts +| bin balance span=25000 +| fields balance +| head 2 +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-------------+ +| balance | +|-------------| +| 25000-50000 | +| 0-25000 | ++-------------+ +``` + +## Example 3: Logarithmic span (log10) + +```ppl +source=accounts +| bin balance span=log10 +| fields balance +| head 2 +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++------------------+ +| balance | +|------------------| +| 10000.0-100000.0 | +| 1000.0-10000.0 | ++------------------+ +``` + +## Example 4: Logarithmic span with coefficient + +```ppl +source=accounts +| bin balance span=2log10 +| fields balance +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++------------------+ +| balance | +|------------------| +| 20000.0-200000.0 | +| 2000.0-20000.0 | +| 20000.0-200000.0 | ++------------------+ +``` + +## Example 5: Basic bins parameter + +```ppl +source=time_test +| bin value bins=5 +| fields value +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++------------+ +| value | +|------------| +| 8000-9000 | +| 7000-8000 | +| 9000-10000 | ++------------+ +``` + +## Example 6: Low bin count + +```ppl +source=accounts +| bin age bins=2 +| fields age +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------+ +| age | +|-------| +| 30-40 | ++-------+ +``` + +## Example 7: High bin count + +```ppl +source=accounts +| bin age bins=21 +| fields age, account_number +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-------+----------------+ +| age | account_number | +|-------+----------------| +| 32-33 | 1 | +| 36-37 | 6 | +| 28-29 | 13 | ++-------+----------------+ +``` + +## Example 8: Basic minspan + +```ppl +source=accounts +| bin age minspan=5 +| fields age, account_number +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-------+----------------+ +| age | account_number | +|-------+----------------| +| 30-40 | 1 | +| 30-40 | 6 | +| 20-30 | 13 | ++-------+----------------+ +``` + +## Example 9: Large minspan + +```ppl +source=accounts +| bin age minspan=101 +| fields age +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| age | +|--------| +| 0-1000 | ++--------+ +``` + +## Example 10: Start and end range + +```ppl +source=accounts +| bin age start=0 end=101 +| fields age +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------+ +| age | +|-------| +| 0-100 | ++-------+ +``` + +## Example 
11: Large end range + +```ppl +source=accounts +| bin balance start=0 end=100001 +| fields balance +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------+ +| balance | +|----------| +| 0-100000 | ++----------+ +``` + +## Example 12: Span with start/end + +```ppl +source=accounts +| bin age span=1 start=25 end=35 +| fields age +| head 6 +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-------+ +| age | +|-------| +| 32-33 | +| 36-37 | +| 28-29 | +| 33-34 | ++-------+ +``` + +## Example 13: Hour span + +```ppl +source=time_test +| bin @timestamp span=1h +| fields @timestamp, value +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++---------------------+-------+ +| @timestamp | value | +|---------------------+-------| +| 2025-07-28 00:00:00 | 8945 | +| 2025-07-28 01:00:00 | 7623 | +| 2025-07-28 02:00:00 | 9187 | ++---------------------+-------+ +``` + +## Example 14: Minute span + +```ppl +source=time_test +| bin @timestamp span=45minute +| fields @timestamp, value +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++---------------------+-------+ +| @timestamp | value | +|---------------------+-------| +| 2025-07-28 00:00:00 | 8945 | +| 2025-07-28 01:30:00 | 7623 | +| 2025-07-28 02:15:00 | 9187 | ++---------------------+-------+ +``` + +## Example 15: Second span + +```ppl +source=time_test +| bin @timestamp span=30seconds +| fields @timestamp, value +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++---------------------+-------+ +| @timestamp | value | +|---------------------+-------| +| 2025-07-28 00:15:30 | 8945 | +| 2025-07-28 01:42:00 | 7623 | +| 2025-07-28 02:28:30 | 9187 | ++---------------------+-------+ +``` + +## Example 16: Daily span + +```ppl +source=time_test +| bin @timestamp span=7day +| fields @timestamp, value +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++---------------------+-------+ +| @timestamp | value | +|---------------------+-------| +| 2025-07-24 00:00:00 | 8945 | +| 2025-07-24 00:00:00 | 7623 | +| 2025-07-24 00:00:00 | 9187 | ++---------------------+-------+ +``` + +## Example 17: Aligntime with time modifier + +```ppl +source=time_test +| bin @timestamp span=2h aligntime='@d+3h' +| fields @timestamp, value +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++---------------------+-------+ +| @timestamp | value | +|---------------------+-------| +| 2025-07-27 23:00:00 | 8945 | +| 2025-07-28 01:00:00 | 7623 | +| 2025-07-28 01:00:00 | 9187 | ++---------------------+-------+ +``` + +## Example 18: Aligntime with epoch timestamp + +```ppl +source=time_test +| bin @timestamp span=2h aligntime=1500000000 +| fields @timestamp, value +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++---------------------+-------+ +| @timestamp | value | +|---------------------+-------| +| 2025-07-27 22:40:00 | 8945 | +| 2025-07-28 00:40:00 | 7623 | +| 2025-07-28 00:40:00 | 9187 | ++---------------------+-------+ +``` + +## Example 19: Default behavior (no parameters) + +```ppl +source=accounts +| bin age +| fields age, account_number +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----------+----------------+ +| age | account_number | +|-----------+----------------| +| 32.0-33.0 | 1 | +| 36.0-37.0 | 6 | +| 28.0-29.0 | 13 | ++-----------+----------------+ +``` + +## Example 20: Binning with string fields + +```ppl 
+source=accounts +| eval age_str = CAST(age AS STRING) +| bin age_str bins=3 +| stats count() by age_str +| sort age_str +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++---------+---------+ +| count() | age_str | +|---------+---------| +| 1 | 20-30 | +| 3 | 30-40 | ++---------+---------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/bin.rst b/docs/user/ppl/cmd/bin.rst deleted file mode 100644 index f42fb1da84f..00000000000 --- a/docs/user/ppl/cmd/bin.rst +++ /dev/null @@ -1,348 +0,0 @@ -=== -bin -=== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -============ -| The ``bin`` command groups numeric values into buckets of equal intervals, making it useful for creating histograms and analyzing data distribution. It takes a numeric or time-based field and generates a new field with values that represent the lower bound of each bucket. - -Syntax -====== -bin [span=] [minspan=] [bins=] [aligntime=(earliest | latest | )] [start=] [end=] - -* field: mandatory. The field to bin. Accepts numeric or time-based fields. -* span: optional. The interval size for each bin. Cannot be used with bins or minspan parameters. - - * Supports numeric (e.g., ``1000``), logarithmic (e.g., ``log10``, ``2log10``), and time intervals - * Available time units: - - * microsecond (us) - * millisecond (ms) - * centisecond (cs) - * decisecond (ds) - * second (s, sec, secs, second, seconds) - * minute (m, min, mins, minute, minutes) - * hour (h, hr, hrs, hour, hours) - * day (d, day, days) - * month (mon, month, months) - -* minspan: optional. The minimum interval size for automatic span calculation. Cannot be used with span or bins parameters. -* bins: optional. The maximum number of equal-width bins to create. Cannot be used with span or minspan parameters. The bins parameter must be between 2 and 50000 (inclusive). -* aligntime: optional. Align the bin times for time-based fields. Valid only for time-based discretization. Options: - - * earliest: Align bins to the earliest timestamp in the data - * latest: Align bins to the latest timestamp in the data - * : Align bins to a specific epoch time value or time modifier expression - -* start: optional. The starting value for binning range. **Default:** minimum field value. -* end: optional. The ending value for binning range. **Default:** maximum field value. - -**Parameter Behavior** - -When multiple parameters are specified, priority order is: span > minspan > bins > start/end > default. - -**Special Behaviors:** -* Logarithmic span (``log10``, ``2log10``, etc.) 
creates logarithmic bin boundaries instead of linear -* Daily/monthly spans automatically align to calendar boundaries and return date strings (YYYY-MM-DD) instead of timestamps -* aligntime parameter only applies to time spans excluding days/months -* start/end parameters expand the range (never shrink) and affect bin width calculation - -Example 1: Basic numeric span -============================= - -PPL query:: - - os> source=accounts | bin age span=10 | fields age, account_number | head 3; - fetched rows / total rows = 3/3 - +-------+----------------+ - | age | account_number | - |-------+----------------| - | 30-40 | 1 | - | 30-40 | 6 | - | 20-30 | 13 | - +-------+----------------+ - -Example 2: Large numeric span -============================== - -PPL query:: - - os> source=accounts | bin balance span=25000 | fields balance | head 2; - fetched rows / total rows = 2/2 - +-------------+ - | balance | - |-------------| - | 25000-50000 | - | 0-25000 | - +-------------+ - - -Example 3: Logarithmic span (log10) -=================================== - -PPL query:: - - os> source=accounts | bin balance span=log10 | fields balance | head 2; - fetched rows / total rows = 2/2 - +------------------+ - | balance | - |------------------| - | 10000.0-100000.0 | - | 1000.0-10000.0 | - +------------------+ - -Example 4: Logarithmic span with coefficient -============================================ - -PPL query:: - - os> source=accounts | bin balance span=2log10 | fields balance | head 3; - fetched rows / total rows = 3/3 - +------------------+ - | balance | - |------------------| - | 20000.0-200000.0 | - | 2000.0-20000.0 | - | 20000.0-200000.0 | - +------------------+ - -Example 5: Basic bins parameter -=============================== - -PPL query:: - - os> source=time_test | bin value bins=5 | fields value | head 3; - fetched rows / total rows = 3/3 - +------------+ - | value | - |------------| - | 8000-9000 | - | 7000-8000 | - | 9000-10000 | - +------------+ - -Example 6: Low bin count -======================== - -PPL query:: - - os> source=accounts | bin age bins=2 | fields age | head 1; - fetched rows / total rows = 1/1 - +-------+ - | age | - |-------| - | 30-40 | - +-------+ - -Example 7: High bin count -========================= - -PPL query:: - - os> source=accounts | bin age bins=21 | fields age, account_number | head 3; - fetched rows / total rows = 3/3 - +-------+----------------+ - | age | account_number | - |-------+----------------| - | 32-33 | 1 | - | 36-37 | 6 | - | 28-29 | 13 | - +-------+----------------+ - -Example 8: Basic minspan -======================== - -PPL query:: - - os> source=accounts | bin age minspan=5 | fields age, account_number | head 3; - fetched rows / total rows = 3/3 - +-------+----------------+ - | age | account_number | - |-------+----------------| - | 30-40 | 1 | - | 30-40 | 6 | - | 20-30 | 13 | - +-------+----------------+ - -Example 9: Large minspan -======================== - -PPL query:: - - os> source=accounts | bin age minspan=101 | fields age | head 1; - fetched rows / total rows = 1/1 - +--------+ - | age | - |--------| - | 0-1000 | - +--------+ - -Example 10: Start and end range -=============================== - -PPL query:: - - os> source=accounts | bin age start=0 end=101 | fields age | head 1; - fetched rows / total rows = 1/1 - +-------+ - | age | - |-------| - | 0-100 | - +-------+ - -Example 11: Large end range -=========================== - -PPL query:: - - os> source=accounts | bin balance start=0 end=100001 | fields balance | head 1; - 
fetched rows / total rows = 1/1 - +----------+ - | balance | - |----------| - | 0-100000 | - +----------+ - -Example 12: Span with start/end -=============================== - -PPL query:: - - os> source=accounts | bin age span=1 start=25 end=35 | fields age | head 6; - fetched rows / total rows = 4/4 - +-------+ - | age | - |-------| - | 32-33 | - | 36-37 | - | 28-29 | - | 33-34 | - +-------+ - -Example 13: Hour span -===================== - -PPL query:: - - os> source=time_test | bin @timestamp span=1h | fields @timestamp, value | head 3; - fetched rows / total rows = 3/3 - +---------------------+-------+ - | @timestamp | value | - |---------------------+-------| - | 2025-07-28 00:00:00 | 8945 | - | 2025-07-28 01:00:00 | 7623 | - | 2025-07-28 02:00:00 | 9187 | - +---------------------+-------+ - -Example 14: Minute span -======================= - -PPL query:: - - os> source=time_test | bin @timestamp span=45minute | fields @timestamp, value | head 3; - fetched rows / total rows = 3/3 - +---------------------+-------+ - | @timestamp | value | - |---------------------+-------| - | 2025-07-28 00:00:00 | 8945 | - | 2025-07-28 01:30:00 | 7623 | - | 2025-07-28 02:15:00 | 9187 | - +---------------------+-------+ - -Example 15: Second span -======================= - -PPL query:: - - os> source=time_test | bin @timestamp span=30seconds | fields @timestamp, value | head 3; - fetched rows / total rows = 3/3 - +---------------------+-------+ - | @timestamp | value | - |---------------------+-------| - | 2025-07-28 00:15:30 | 8945 | - | 2025-07-28 01:42:00 | 7623 | - | 2025-07-28 02:28:30 | 9187 | - +---------------------+-------+ - -Example 16: Daily span -====================== - -PPL query:: - - os> source=time_test | bin @timestamp span=7day | fields @timestamp, value | head 3; - fetched rows / total rows = 3/3 - +---------------------+-------+ - | @timestamp | value | - |---------------------+-------| - | 2025-07-24 00:00:00 | 8945 | - | 2025-07-24 00:00:00 | 7623 | - | 2025-07-24 00:00:00 | 9187 | - +---------------------+-------+ - -Example 17: Aligntime with time modifier -======================================== - -PPL query:: - - os> source=time_test | bin @timestamp span=2h aligntime='@d+3h' | fields @timestamp, value | head 3; - fetched rows / total rows = 3/3 - +---------------------+-------+ - | @timestamp | value | - |---------------------+-------| - | 2025-07-27 23:00:00 | 8945 | - | 2025-07-28 01:00:00 | 7623 | - | 2025-07-28 01:00:00 | 9187 | - +---------------------+-------+ - -Example 18: Aligntime with epoch timestamp -========================================== - -PPL query:: - - os> source=time_test | bin @timestamp span=2h aligntime=1500000000 | fields @timestamp, value | head 3; - fetched rows / total rows = 3/3 - +---------------------+-------+ - | @timestamp | value | - |---------------------+-------| - | 2025-07-27 22:40:00 | 8945 | - | 2025-07-28 00:40:00 | 7623 | - | 2025-07-28 00:40:00 | 9187 | - +---------------------+-------+ - -Example 19: Default behavior (no parameters) -============================================ - -PPL query:: - - os> source=accounts | bin age | fields age, account_number | head 3; - fetched rows / total rows = 3/3 - +-----------+----------------+ - | age | account_number | - |-----------+----------------| - | 32.0-33.0 | 1 | - | 36.0-37.0 | 6 | - | 28.0-29.0 | 13 | - +-----------+----------------+ - - -Example 20: Binning with string fields -============================================== - -PPL query:: - - os> source=accounts | eval age_str = 
CAST(age AS STRING) | bin age_str bins=3 | stats count() by age_str | sort age_str; - fetched rows / total rows = 2/2 - +---------+---------+ - | count() | age_str | - |---------+---------| - | 1 | 20-30 | - | 3 | 30-40 | - +---------+---------+ - diff --git a/docs/user/ppl/cmd/chart.md b/docs/user/ppl/cmd/chart.md new file mode 100644 index 00000000000..829afdedb78 --- /dev/null +++ b/docs/user/ppl/cmd/chart.md @@ -0,0 +1,200 @@ +# chart + +## Description + +The `chart` command transforms search results by applying a statistical aggregation function and optionally grouping the data by one or two fields. The results are suitable for visualization as a two-dimensional chart when grouping by two fields, where unique values in the second group key can be pivoted to column names. + +## Syntax + +chart [limit=(top\|bottom) \<N\>] [useother=\<bool\>] [usenull=\<bool\>] [nullstr=\<string\>] [otherstr=\<string\>] \<aggregation_function\> [ by \<row-split\> \<column-split\> ] \| [over \<row-split\> ] [ by \<column-split\>] +* limit: optional. Specifies the number of categories to display when using column split. Each unique value in the column split field represents a category. **Default:** top10. + * Syntax: `limit=(top|bottom)<N>` or `limit=<N>` (defaults to top) + * When `limit=K` is set, the top or bottom K categories from the column split field are retained; the remaining categories are grouped into an "OTHER" category if `useother` is not set to false. + * Set limit to 0 to show all categories without any limit. + * Use `limit=topK` or `limit=bottomK` to specify whether to retain the top or bottom K column categories. The ranking is based on the sum of aggregated values for each column category. For example, `chart limit=top3 count() by region, product` keeps the 3 products with the highest total counts across all regions. If not specified, top is used by default. + * Only applies when a column split is present (i.e. when grouping by two fields or when the over...by... syntax is used). +* useother: optional. Controls whether to create an "OTHER" category for categories beyond the limit. **Default:** true + * When set to false, only the top/bottom N categories (based on limit) are shown without an "OTHER" category. + * When set to true, categories beyond the limit are grouped into an "OTHER" category. + * Only applies when using column split and when there are more categories than the limit. +* usenull: optional. Controls whether to group events without a column split (i.e. whose column split is null) into a separate "NULL" category. **Default:** true + * `usenull` only applies to the column split. + * The row split should always be a non-null value. Documents with null values in the row split will be ignored. + * When `usenull=false`, events with a null column split are excluded from results. + * When `usenull=true`, events with a null column split are grouped into a separate "NULL" category. +* nullstr: optional. Specifies the category name for rows that do not contain the column split value. **Default:** "NULL" + * Only applies when `usenull` is set to true. +* otherstr: optional. Specifies the category name for the "OTHER" category. **Default:** "OTHER" + * Only applies when `useother` is set to true and there are values beyond the limit. +* aggregation_function: mandatory. The aggregation function to apply to the data. + * Currently, only a single aggregation function is supported. + * Available functions: aggregation functions supported by the stats command. +* by: optional.
Groups the results by either one field (row split) or two fields (row split and column split). + * `limit`, `useother`, and `usenull` apply to the column split. + * Results are returned as individual rows for each combination. + * If not specified, the aggregation is performed across all documents. +* over...by...: optional. Alternative syntax for grouping by multiple fields. + * `over <row-split> by <column-split>` groups the results by both fields. + * Using `over <field>` alone is equivalent to `by <field>`. + +## Notes + +* The fields generated by column splitting are converted to strings so that they are compatible with `nullstr` and `otherstr` and can be used as column names once pivoted. +* Documents with null values in fields used by the aggregation function are excluded from aggregation. For example, in `chart avg(balance) over deptno, group`, documents where `balance` is null are excluded from the average calculation. +* The aggregation metric appears as the last column in the result. Result columns are ordered as: [row-split] [column-split] [aggregation-metrics]. + +## Example 1: Basic aggregation without grouping + +This example calculates the average balance across all accounts. + +```ppl +source=accounts +| chart avg(balance) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| avg(balance) | +|--------------| +| 20482.25 | ++--------------+ +``` + +## Example 2: Group by single field + +This example calculates the count of accounts grouped by gender. + +```ppl +source=accounts +| chart count() by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++--------+---------+ +| gender | count() | +|--------+---------| +| F | 1 | +| M | 3 | ++--------+---------+ +``` + +## Example 3: Using over and by for multiple field grouping + +This example shows average balance grouped by both gender and age fields. Note that the age column in the result is converted to string type. + +```ppl +source=accounts +| chart avg(balance) over gender by age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+-----+--------------+ +| gender | age | avg(balance) | +|--------+-----+--------------| +| F | 28 | 32838.0 | +| M | 32 | 39225.0 | +| M | 33 | 4180.0 | +| M | 36 | 5686.0 | ++--------+-----+--------------+ +``` + +## Example 4: Using basic limit functionality + +This example limits the results to show only the top 1 age group. Note that the age column in the result is converted to string type. + +```ppl +source=accounts +| chart limit=1 count() over gender by age +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++--------+-------+---------+ +| gender | age | count() | +|--------+-------+---------| +| F | OTHER | 1 | +| M | 33 | 1 | +| M | OTHER | 2 | ++--------+-------+---------+ +``` + +## Example 5: Using limit with other parameters + +This example shows using limit with useother and custom otherstr parameters. + +```ppl +source=accounts +| chart limit=top1 useother=true otherstr='minor_gender' count() over state by gender +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-------+--------------+---------+ +| state | gender | count() | +|-------+--------------+---------| +| IL | M | 1 | +| MD | M | 1 | +| TN | M | 1 | +| VA | minor_gender | 1 | ++-------+--------------+---------+ +```
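 + +The limit and null-handling parameters can be combined freely. The following sketches are illustrative only (outputs omitted); they show disabling the category limit and excluding null column-split values: + +```sql ignore +source = accounts | chart limit=0 count() over state by gender +source = accounts | chart usenull=false count() over firstname by employer +``` + +## Example 6: Using null parameters + +This example shows using usenull with a custom nullstr parameter.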
+ +```ppl +source=accounts +| chart usenull=true nullstr='employer not specified' count() over firstname by employer +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+------------------------+---------+ +| firstname | employer | count() | +|-----------+------------------------+---------| +| Amber | Pyrami | 1 | +| Dale | employer not specified | 1 | +| Hattie | Netagy | 1 | +| Nanette | Quility | 1 | ++-----------+------------------------+---------+ +``` + +## Example 7: Using chart command with span + +This example demonstrates using span for grouping age ranges. + +```ppl +source=accounts +| chart max(balance) by age span=10, gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----+--------+--------------+ +| age | gender | max(balance) | +|-----+--------+--------------| +| 20 | F | 32838 | +| 30 | M | 39225 | ++-----+--------+--------------+ +``` + +## Limitations + +* Only a single aggregation function is supported per chart command. \ No newline at end of file diff --git a/docs/user/ppl/cmd/chart.rst b/docs/user/ppl/cmd/chart.rst deleted file mode 100644 index 4ffe3e7abef..00000000000 --- a/docs/user/ppl/cmd/chart.rst +++ /dev/null @@ -1,193 +0,0 @@ -===== -chart -===== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== - -The ``chart`` command transforms search results by applying a statistical aggregation function and optionally grouping the data by one or two fields. The results are suitable for visualization as a two-dimension chart when grouping by two fields, where unique values in the second group key can be pivoted to column names. - -Syntax -====== -chart [limit=(top|bottom) ] [useother=] [usenull=] [nullstr=] [otherstr=] [ by ] | [over ] [ by ] - -* limit: optional. Specifies the number of categories to display when using column split. Each unique value in the column split field represents a category. **Default:** top10. - - * Syntax: ``limit=(top|bottom)`` or ``limit=`` (defaults to top) - * When ``limit=K`` is set, the top or bottom K categories from the column split field are retained; the remaining categories are grouped into an "OTHER" category if ``useother`` is not set to false. - * Set limit to 0 to show all categories without any limit. - * Use ``limit=topK`` or ``limit=bottomK`` to specify whether to retain the top or bottom K column categories. The ranking is based on the sum of aggregated values for each column category. For example, ``chart limit=top3 count() by region, product`` keeps the 3 products with the highest total counts across all regions. If not specified, top is used by default. - * Only applies when column split is present (by 2 fields or over...by... coexists). - -* useother: optional. Controls whether to create an "OTHER" category for categories beyond the limit. **Default:** true - - * When set to false, only the top/bottom N categories (based on limit) are shown without an "OTHER" category. - * When set to true, categories beyond the limit are grouped into an "OTHER" category. - * Only applies when using column split and when there are more categories than the limit. - -* usenull: optional. Controls whether to group events without a column split (i.e. whose column split is null) into a separate "NULL" category. **Default:** true - - * ``usenull`` only applies to column split. - * Row split should always be non-null value. Documents with null values in row split will be ignored. 
- * When ``usenull=false``, events with a null column split are excluded from results. - * When ``usenull=true``, events with a null column split are grouped into a separate "NULL" category. - -* nullstr: optional. Specifies the category name for rows that do not contain the column split value. **Default:** "NULL" - - * Only applies when ``usenull`` is set to true. - -* otherstr: optional. Specifies the category name for the "OTHER" category. **Default:** "OTHER" - - * Only applies when ``useother`` is set to true and there are values beyond the limit. - -* aggregation_function: mandatory. The aggregation function to apply to the data. - - * Currently, only a single aggregation function is supported. - * Available functions: aggregation functions supported by the stats command. - -* by: optional. Groups the results by either one field (row split) or two fields (row split and column split) - - * ``limit``, ``useother``, and ``usenull`` apply to the column split - * Results are returned as individual rows for each combination. - * If not specified, the aggregation is performed across all documents. - -* over...by...: optional. Alternative syntax for grouping by multiple fields. - - * ``over by `` groups the results by both fields. - * Using ``over`` alone on one field is equivalent to ``by `` - -Notes -===== - -* The fields generated by column splitting are converted to strings so that they are compatible with ``nullstr`` and ``otherstr`` and can be used as column names once pivoted. -* Documents with null values in fields used by the aggregation function are excluded from aggregation. For example, in ``chart avg(balance) over deptno, group``, documents where ``balance`` is null are excluded from the average calculation. -* The aggregation metric appears as the last column in the result. Result columns are ordered as: [row-split] [column-split] [aggregation-metrics]. - -Example 1: Basic aggregation without grouping -============================================= - -This example calculates the average balance across all accounts. - -PPL query:: - - os> source=accounts | chart avg(balance) - fetched rows / total rows = 1/1 - +--------------+ - | avg(balance) | - |--------------| - | 20482.25 | - +--------------+ - -Example 2: Group by single field -================================ - -This example calculates the count of accounts grouped by gender. - -PPL query:: - - os> source=accounts | chart count() by gender - fetched rows / total rows = 2/2 - +--------+---------+ - | gender | count() | - |--------+---------| - | F | 1 | - | M | 3 | - +--------+---------+ - -Example 3: Using over and by for multiple field grouping -======================================================== - -This example shows average balance grouped by both gender and age fields. Note that the age column in the result is converted to string type. - -PPL query:: - - os> source=accounts | chart avg(balance) over gender by age - fetched rows / total rows = 4/4 - +--------+-----+--------------+ - | gender | age | avg(balance) | - |--------+-----+--------------| - | F | 28 | 32838.0 | - | M | 32 | 39225.0 | - | M | 33 | 4180.0 | - | M | 36 | 5686.0 | - +--------+-----+--------------+ - -Example 4: Using basic limit functionality -========================================== - -This example limits the results to show only the top 1 age group. Note that the age column in the result is converted to string type. 
- -PPL query:: - - os> source=accounts | chart limit=1 count() over gender by age - fetched rows / total rows = 3/3 - +--------+-------+---------+ - | gender | age | count() | - |--------+-------+---------| - | F | OTHER | 1 | - | M | 33 | 1 | - | M | OTHER | 2 | - +--------+-------+---------+ - -Example 5: Using limit with other parameters -============================================ - -This example shows using limit with useother and custom otherstr parameters. - -PPL query:: - - os> source=accounts | chart limit=top1 useother=true otherstr='minor_gender' count() over state by gender - fetched rows / total rows = 4/4 - +-------+--------------+---------+ - | state | gender | count() | - |-------+--------------+---------| - | IL | M | 1 | - | MD | M | 1 | - | TN | M | 1 | - | VA | minor_gender | 1 | - +-------+--------------+---------+ - -Example 6: Using null parameters -================================ - -This example shows using limit with usenull and custom nullstr parameters. - -PPL query:: - - os> source=accounts | chart usenull=true nullstr='employer not specified' count() over firstname by employer - fetched rows / total rows = 4/4 - +-----------+------------------------+---------+ - | firstname | employer | count() | - |-----------+------------------------+---------| - | Amber | Pyrami | 1 | - | Dale | employer not specified | 1 | - | Hattie | Netagy | 1 | - | Nanette | Quility | 1 | - +-----------+------------------------+---------+ - -Example 7: Using chart command with span -======================================== - -This example demonstrates using span for grouping age ranges. - -PPL query:: - - os> source=accounts | chart max(balance) by age span=10, gender - fetched rows / total rows = 2/2 - +-----+--------+--------------+ - | age | gender | max(balance) | - |-----+--------+--------------| - | 20 | F | 32838 | - | 30 | M | 39225 | - +-----+--------+--------------+ - -Limitations -=========== -* Only a single aggregation function is supported per chart command. diff --git a/docs/user/ppl/cmd/dedup.md b/docs/user/ppl/cmd/dedup.md new file mode 100644 index 00000000000..59dfcf63ddc --- /dev/null +++ b/docs/user/ppl/cmd/dedup.md @@ -0,0 +1,134 @@ +# dedup + +## Description + +The `dedup` command removes duplicate documents defined by specified fields from the search result. + +## Syntax + +dedup [int] \<field-list\> [keepempty=\<bool\>] [consecutive=\<bool\>] +* int: optional. The `dedup` command retains multiple events for each combination when you specify \<int\>. The number for \<int\> must be greater than 0. All other duplicates are removed from the results. **Default:** 1 +* keepempty: optional. If set to true, keeps the document if any field in the field-list has a NULL value or is MISSING. **Default:** false. +* consecutive: optional. If set to true, removes only events with duplicate combinations of values that are consecutive. **Default:** false. +* field-list: mandatory. The comma-delimited field list. At least one field is required. + +## Example 1: Dedup by one field + +This example shows deduplicating documents by gender field. + +```ppl +source=accounts +| dedup gender +| fields account_number, gender +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------+--------+ +| account_number | gender | +|----------------+--------| +| 1 | M | +| 13 | F | ++----------------+--------+ +```
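 + +Multiple fields can be deduplicated together by listing them comma-delimited; a minimal illustrative sketch (output omitted): + +```sql ignore +source = accounts | dedup gender, state | fields account_number, gender, state +``` + +## Example 2: Keep 2 duplicate documents + +This example shows deduplicating documents by gender field while keeping 2 duplicates.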
+ +```ppl +source=accounts +| dedup 2 gender +| fields account_number, gender +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+--------+ +| account_number | gender | +|----------------+--------| +| 1 | M | +| 6 | M | +| 13 | F | ++----------------+--------+ +``` + +## Example 3: Keep or Ignore the empty field by default + +This example shows deduplicating documents while keeping null values. + +```ppl +source=accounts +| dedup email keepempty=true +| fields account_number, email +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------------------+ +| account_number | email | +|----------------+-----------------------| +| 1 | amberduke@pyrami.com | +| 6 | hattiebond@netagy.com | +| 13 | null | +| 18 | daleadams@boink.com | ++----------------+-----------------------+ +``` + +This example shows deduplicating documents while ignoring null values. + +```ppl +source=accounts +| dedup email +| fields account_number, email +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+-----------------------+ +| account_number | email | +|----------------+-----------------------| +| 1 | amberduke@pyrami.com | +| 6 | hattiebond@netagy.com | +| 18 | daleadams@boink.com | ++----------------+-----------------------+ +``` + +## Example 4: Dedup in consecutive document + +This example shows deduplicating consecutive documents. + +```ppl +source=accounts +| dedup gender consecutive=true +| fields account_number, gender +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+--------+ +| account_number | gender | +|----------------+--------| +| 1 | M | +| 13 | F | +| 18 | M | ++----------------+--------+ +``` + +## Limitations + +The `dedup` with `consecutive=true` command can only work with `plugins.calcite.enabled=false`. \ No newline at end of file diff --git a/docs/user/ppl/cmd/dedup.rst b/docs/user/ppl/cmd/dedup.rst deleted file mode 100644 index bc3e9a48ca5..00000000000 --- a/docs/user/ppl/cmd/dedup.rst +++ /dev/null @@ -1,111 +0,0 @@ -===== -dedup -===== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -The ``dedup`` command removes duplicate documents defined by specified fields from the search result. - -Syntax -====== -dedup [int] [keepempty=] [consecutive=] - -* int: optional. The ``dedup`` command retains multiple events for each combination when you specify . The number for must be greater than 0. All other duplicates are removed from the results. **Default:** 1 -* keepempty: optional. If set to true, keep the document if the any field in the field-list has NULL value or field is MISSING. **Default:** false. -* consecutive: optional. If set to true, removes only events with duplicate combinations of values that are consecutive. **Default:** false. -* field-list: mandatory. The comma-delimited field list. At least one field is required. - -Example 1: Dedup by one field -============================= - -This example shows deduplicating documents by gender field. 
- -PPL query:: - - os> source=accounts | dedup gender | fields account_number, gender | sort account_number; - fetched rows / total rows = 2/2 - +----------------+--------+ - | account_number | gender | - |----------------+--------| - | 1 | M | - | 13 | F | - +----------------+--------+ - -Example 2: Keep 2 duplicates documents -====================================== - -This example shows deduplicating documents by gender field while keeping 2 duplicates. - -PPL query:: - - os> source=accounts | dedup 2 gender | fields account_number, gender | sort account_number; - fetched rows / total rows = 3/3 - +----------------+--------+ - | account_number | gender | - |----------------+--------| - | 1 | M | - | 6 | M | - | 13 | F | - +----------------+--------+ - -Example 3: Keep or Ignore the empty field by default -==================================================== - -This example shows deduplicating documents while keeping null values. - -PPL query:: - - os> source=accounts | dedup email keepempty=true | fields account_number, email | sort account_number; - fetched rows / total rows = 4/4 - +----------------+-----------------------+ - | account_number | email | - |----------------+-----------------------| - | 1 | amberduke@pyrami.com | - | 6 | hattiebond@netagy.com | - | 13 | null | - | 18 | daleadams@boink.com | - +----------------+-----------------------+ - - -This example shows deduplicating documents while ignoring null values. - -PPL query:: - - os> source=accounts | dedup email | fields account_number, email | sort account_number; - fetched rows / total rows = 3/3 - +----------------+-----------------------+ - | account_number | email | - |----------------+-----------------------| - | 1 | amberduke@pyrami.com | - | 6 | hattiebond@netagy.com | - | 18 | daleadams@boink.com | - +----------------+-----------------------+ - - -Example 4: Dedup in consecutive document -======================================== - -This example shows deduplicating consecutive documents. - -PPL query:: - - os> source=accounts | dedup gender consecutive=true | fields account_number, gender | sort account_number; - fetched rows / total rows = 3/3 - +----------------+--------+ - | account_number | gender | - |----------------+--------| - | 1 | M | - | 13 | F | - | 18 | M | - +----------------+--------+ - -Limitations -=========== -The ``dedup`` with ``consecutive=true`` command can only work with ``plugins.calcite.enabled=false``. diff --git a/docs/user/ppl/cmd/describe.md b/docs/user/ppl/cmd/describe.md new file mode 100644 index 00000000000..d6efffc9d58 --- /dev/null +++ b/docs/user/ppl/cmd/describe.md @@ -0,0 +1,67 @@ +# describe + +## Description + +Use the `describe` command to query metadata of an index. The `describe` command can only be used as the first command in a PPL query. + +## Syntax + +describe [dataSource.][schema.]\<tablename\> +* dataSource: optional. If dataSource is not provided, it resolves to the opensearch dataSource. +* schema: optional. If schema is not provided, it resolves to the default schema. +* tablename: mandatory. The describe command must specify which table to query.
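 + +Tables in other configured datasources can be described by prefixing the datasource name; a minimal sketch, assuming a hypothetical Prometheus datasource named my_prometheus (output omitted): + +```sql ignore +describe my_prometheus.prometheus_http_requests_total +``` + +## Example 1: Fetch all the metadata + +This example describes the accounts index.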
+ +```ppl +describe accounts +``` + +Expected output: + +```text +fetched rows / total rows = 11/11 ++----------------+-------------+------------+----------------+-----------+-----------+-------------+---------------+----------------+----------------+----------+---------+------------+---------------+------------------+-------------------+------------------+-------------+---------------+--------------+-------------+------------------+------------------+--------------------+ +| TABLE_CAT | TABLE_SCHEM | TABLE_NAME | COLUMN_NAME | DATA_TYPE | TYPE_NAME | COLUMN_SIZE | BUFFER_LENGTH | DECIMAL_DIGITS | NUM_PREC_RADIX | NULLABLE | REMARKS | COLUMN_DEF | SQL_DATA_TYPE | SQL_DATETIME_SUB | CHAR_OCTET_LENGTH | ORDINAL_POSITION | IS_NULLABLE | SCOPE_CATALOG | SCOPE_SCHEMA | SCOPE_TABLE | SOURCE_DATA_TYPE | IS_AUTOINCREMENT | IS_GENERATEDCOLUMN | +|----------------+-------------+------------+----------------+-----------+-----------+-------------+---------------+----------------+----------------+----------+---------+------------+---------------+------------------+-------------------+------------------+-------------+---------------+--------------+-------------+------------------+------------------+--------------------| +| docTestCluster | null | accounts | account_number | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 0 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | firstname | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 1 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | address | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 2 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | balance | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 3 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | gender | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 4 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | city | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 5 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | employer | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 6 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | state | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 7 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | age | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 8 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | email | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 9 | | null | null | null | null | NO | | +| docTestCluster | null | accounts | lastname | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 10 | | null | null | null | null | NO | | ++----------------+-------------+------------+----------------+-----------+-----------+-------------+---------------+----------------+----------------+----------+---------+------------+---------------+------------------+-------------------+------------------+-------------+---------------+--------------+-------------+------------------+------------------+--------------------+ +``` + +## Example 2: Fetch metadata with condition and filter + +This example 
retrieves columns with type bigint in the accounts index. + +```ppl +describe accounts +| where TYPE_NAME="bigint" +| fields COLUMN_NAME +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+ +| COLUMN_NAME | +|----------------| +| account_number | +| balance | +| age | ++----------------+ +``` + +## Example 3: Fetch metadata for table in Prometheus datasource + +See [Fetch metadata for table in Prometheus datasource](../admin/datasources.md) for more context. \ No newline at end of file diff --git a/docs/user/ppl/cmd/describe.rst b/docs/user/ppl/cmd/describe.rst deleted file mode 100644 index 2fbb4003414..00000000000 --- a/docs/user/ppl/cmd/describe.rst +++ /dev/null @@ -1,70 +0,0 @@ -======== -describe -======== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -Use the ``describe`` command to query metadata of the index. ``describe`` command can only be used as the first command in the PPL query. - -Syntax -====== -describe [dataSource.][schema.] - -* dataSource: optional. If dataSource is not provided, it resolves to opensearch dataSource. -* schema: optional. If schema is not provided, it resolves to default schema. -* tablename: mandatory. describe command must specify which tablename to query from. - -Example 1: Fetch all the metadata -================================= - -This example describes the accounts index. - -PPL query:: - - os> describe accounts; - fetched rows / total rows = 11/11 - +----------------+-------------+------------+----------------+-----------+-----------+-------------+---------------+----------------+----------------+----------+---------+------------+---------------+------------------+-------------------+------------------+-------------+---------------+--------------+-------------+------------------+------------------+--------------------+ - | TABLE_CAT | TABLE_SCHEM | TABLE_NAME | COLUMN_NAME | DATA_TYPE | TYPE_NAME | COLUMN_SIZE | BUFFER_LENGTH | DECIMAL_DIGITS | NUM_PREC_RADIX | NULLABLE | REMARKS | COLUMN_DEF | SQL_DATA_TYPE | SQL_DATETIME_SUB | CHAR_OCTET_LENGTH | ORDINAL_POSITION | IS_NULLABLE | SCOPE_CATALOG | SCOPE_SCHEMA | SCOPE_TABLE | SOURCE_DATA_TYPE | IS_AUTOINCREMENT | IS_GENERATEDCOLUMN | - |----------------+-------------+------------+----------------+-----------+-----------+-------------+---------------+----------------+----------------+----------+---------+------------+---------------+------------------+-------------------+------------------+-------------+---------------+--------------+-------------+------------------+------------------+--------------------| - | docTestCluster | null | accounts | account_number | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 0 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | firstname | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 1 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | address | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 2 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | balance | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 3 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | gender | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 4 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | city | null | 
string | null | null | null | 10 | 2 | null | null | null | null | null | 5 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | employer | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 6 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | state | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 7 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | age | null | bigint | null | null | null | 10 | 2 | null | null | null | null | null | 8 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | email | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 9 | | null | null | null | null | NO | | - | docTestCluster | null | accounts | lastname | null | string | null | null | null | 10 | 2 | null | null | null | null | null | 10 | | null | null | null | null | NO | | - +----------------+-------------+------------+----------------+-----------+-----------+-------------+---------------+----------------+----------------+----------+---------+------------+---------------+------------------+-------------------+------------------+-------------+---------------+--------------+-------------+------------------+------------------+--------------------+ - -Example 2: Fetch metadata with condition and filter -=================================================== - -This example retrieves columns with type bigint in the accounts index. - -PPL query:: - - os> describe accounts | where TYPE_NAME="bigint" | fields COLUMN_NAME; - fetched rows / total rows = 3/3 - +----------------+ - | COLUMN_NAME | - |----------------| - | account_number | - | balance | - | age | - +----------------+ - - -Example 3: Fetch metadata for table in Prometheus datasource -============================================================ - -See `Fetch metadata for table in Prometheus datasource <../admin/datasources.rst>`_ for more context. diff --git a/docs/user/ppl/cmd/eval.md b/docs/user/ppl/cmd/eval.md new file mode 100644 index 00000000000..d3300cd6b01 --- /dev/null +++ b/docs/user/ppl/cmd/eval.md @@ -0,0 +1,132 @@ +# eval + +## Description + +The `eval` command evaluates the expression and appends the result to the search result. + +## Syntax + +eval \<field\>=\<expression\> ["," \<field\>=\<expression\> ]... +* field: mandatory. If the field name does not exist, a new field is added. If the field name already exists, it will be overridden. +* expression: mandatory. Any expression supported by the system. + +## Example 1: Create a new field + +This example shows creating a new field doubleAge for each document. The new doubleAge field is the result of multiplying age by 2. + +```ppl +source=accounts +| eval doubleAge = age * 2 +| fields age, doubleAge +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+-----------+ +| age | doubleAge | +|-----+-----------| +| 32 | 64 | +| 36 | 72 | +| 28 | 56 | +| 33 | 66 | ++-----+-----------+ +``` + +## Example 2: Override an existing field + +This example shows overriding the existing age field by adding 1 to it. + +```ppl +source=accounts +| eval age = age + 1 +| fields age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+ +| age | +|-----| +| 33 | +| 37 | +| 29 | +| 34 | ++-----+ +``` + +## Example 3: Create a new field with field defined in eval + +This example shows creating a new field ddAge using a field defined in the same eval command.
The new field ddAge is the result of multiplying doubleAge by 2, where doubleAge is defined in the same eval command. + +```ppl +source=accounts +| eval doubleAge = age * 2, ddAge = doubleAge * 2 +| fields age, doubleAge, ddAge +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+-----------+-------+ +| age | doubleAge | ddAge | +|-----+-----------+-------| +| 32 | 64 | 128 | +| 36 | 72 | 144 | +| 28 | 56 | 112 | +| 33 | 66 | 132 | ++-----+-----------+-------+ +``` + +## Example 4: String concatenation + +This example shows using the + operator for string concatenation. You can concatenate string literals and field values. + +```ppl +source=accounts +| eval greeting = 'Hello ' + firstname +| fields firstname, greeting +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+---------------+ +| firstname | greeting | +|-----------+---------------| +| Amber | Hello Amber | +| Hattie | Hello Hattie | +| Nanette | Hello Nanette | +| Dale | Hello Dale | ++-----------+---------------+ +``` + +## Example 5: Multiple string concatenation with type casting + +This example shows multiple concatenations with type casting from numeric to string. + +```ppl +source=accounts | eval full_info = 'Name: ' + firstname + ', Age: ' + CAST(age AS STRING) | fields firstname, age, full_info +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+-----+------------------------+ +| firstname | age | full_info | +|-----------+-----+------------------------| +| Amber | 32 | Name: Amber, Age: 32 | +| Hattie | 36 | Name: Hattie, Age: 36 | +| Nanette | 28 | Name: Nanette, Age: 28 | +| Dale | 33 | Name: Dale, Age: 33 | ++-----------+-----+------------------------+ +``` + +## Limitations + +The `eval` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node. \ No newline at end of file diff --git a/docs/user/ppl/cmd/eval.rst b/docs/user/ppl/cmd/eval.rst deleted file mode 100644 index ada7b179526..00000000000 --- a/docs/user/ppl/cmd/eval.rst +++ /dev/null @@ -1,120 +0,0 @@ -==== -eval -==== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -The ``eval`` command evaluates the expression and appends the result to the search result. - -Syntax -====== -eval = ["," = ]... - -* field: mandatory. If the field name does not exist, a new field is added. If the field name already exists, it will be overridden. -* expression: mandatory. Any expression supported by the system. - -Example 1: Create a new field -============================= - -This example shows creating a new field doubleAge for each document. The new doubleAge field is the result of multiplying age by 2. - -PPL query:: - - os> source=accounts | eval doubleAge = age * 2 | fields age, doubleAge ; - fetched rows / total rows = 4/4 - +-----+-----------+ - | age | doubleAge | - |-----+-----------| - | 32 | 64 | - | 36 | 72 | - | 28 | 56 | - | 33 | 66 | - +-----+-----------+ - - -Example 2: Override an existing field -===================================== - -This example shows overriding the existing age field by adding 1 to it. 
- -PPL query:: - - os> source=accounts | eval age = age + 1 | fields age ; - fetched rows / total rows = 4/4 - +-----+ - | age | - |-----| - | 33 | - | 37 | - | 29 | - | 34 | - +-----+ - -Example 3: Create a new field with field defined in eval -======================================================== - -This example shows creating a new field ddAge using a field defined in the same eval command. The new field ddAge is the result of multiplying doubleAge by 2, where doubleAge is defined in the same eval command. - -PPL query:: - - os> source=accounts | eval doubleAge = age * 2, ddAge = doubleAge * 2 | fields age, doubleAge, ddAge ; - fetched rows / total rows = 4/4 - +-----+-----------+-------+ - | age | doubleAge | ddAge | - |-----+-----------+-------| - | 32 | 64 | 128 | - | 36 | 72 | 144 | - | 28 | 56 | 112 | - | 33 | 66 | 132 | - +-----+-----------+-------+ - -Example 4: String concatenation -=============================== - -This example shows using the + operator for string concatenation. You can concatenate string literals and field values. - -PPL query:: - - source=accounts | eval greeting = 'Hello ' + firstname | fields firstname, greeting - -Expected result:: - - +---------------+---------------------+ - | firstname | greeting | - |---------------+---------------------| - | Amber JOHnny | Hello Amber JOHnny | - | Hattie | Hello Hattie | - | Nanette | Hello Nanette | - | Dale | Hello Dale | - +---------------+---------------------+ - -Example 5: Multiple string concatenation with type casting -========================================================== - -This example shows multiple concatenations with type casting from numeric to string. - -PPL query:: - - source=accounts | eval full_info = 'Name: ' + firstname + ', Age: ' + CAST(age AS STRING) | fields firstname, age, full_info - -Expected result:: - - +---------------+-----+-------------------------------+ - | firstname | age | full_info | - |---------------+-----+-------------------------------| - | Amber JOHnny | 32 | Name: Amber JOHnny, Age: 32 | - | Hattie | 36 | Name: Hattie, Age: 36 | - | Nanette | 28 | Name: Nanette, Age: 28 | - | Dale | 33 | Name: Dale, Age: 33 | - +---------------+-----+-------------------------------+ - -Limitations -=========== -The ``eval`` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node. diff --git a/docs/user/ppl/cmd/eventstats.md b/docs/user/ppl/cmd/eventstats.md new file mode 100644 index 00000000000..1cb791d95e9 --- /dev/null +++ b/docs/user/ppl/cmd/eventstats.md @@ -0,0 +1,166 @@ +# eventstats + +## Description + +The `eventstats` command enriches your event data with calculated summary statistics. It operates by analyzing specified fields within your events, computing various statistical measures, and then appending these results as new fields to each original event. +Key aspects of `eventstats`: +1. It performs calculations across the entire result set or within defined groups. +2. The original events remain intact, with new fields added to contain the statistical results. +3. The command is particularly useful for comparative analysis, identifying outliers, or providing additional context to individual events. + +Difference between `stats` and `eventstats` +The `stats` and `eventstats` commands are both used for calculating statistics, but they have some key differences in how they operate and what they produce: +* Output Format + * `stats`: Produces a summary table with only the calculated statistics. 
+ * `eventstats`: Adds the calculated statistics as new fields to the existing events, preserving the original data. +* Event Retention + * `stats`: Reduces the result set to only the statistical summary, discarding individual events. + * `eventstats`: Retains all original events and adds new fields with the calculated statistics. +* Use Cases + * `stats`: Best for creating summary reports or dashboards. Often used as a final command to summarize results. + * `eventstats`: Useful when you need to enrich events with statistical context for further analysis or filtering. It can be used mid-search to add statistics that can be used in subsequent commands. + +## Syntax + +eventstats [bucket_nullable=bool] \<function\>... [by-clause] +* function: mandatory. An aggregation function or window function. +* bucket_nullable: optional. Controls whether the eventstats command considers null buckets as a valid group in group-by aggregations. When set to `false`, it will not treat null group-by values as a distinct group during aggregation. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`. + * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true` + * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false` +* by-clause: optional. Groups results by specified fields or expressions. Syntax: by [span-expression,] [field,]... **Default:** aggregation over the entire result set. +* span-expression: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets. + * Available time units: + * millisecond (ms) + * second (s) + * minute (m, case sensitive) + * hour (h) + * day (d) + * week (w) + * month (M, case sensitive) + * quarter (q) + * year (y) + +## Aggregation Functions + +The eventstats command supports the following aggregation functions: +* COUNT: Count of values +* SUM: Sum of numeric values +* AVG: Average of numeric values +* MAX: Maximum value +* MIN: Minimum value +* VAR_SAMP: Sample variance +* VAR_POP: Population variance +* STDDEV_SAMP: Sample standard deviation +* STDDEV_POP: Population standard deviation +* DISTINCT_COUNT/DC: Distinct count of values +* EARLIEST: Earliest value by timestamp +* LATEST: Latest value by timestamp + +For detailed documentation of each function, see [Aggregation Functions](../functions/aggregations.md). + +## Usage + +Eventstats + +```sql ignore +source = table | eventstats avg(a) +source = table | where a < 50 | eventstats count(c) +source = table | eventstats min(c), max(c) by b +source = table | eventstats count(c) as count_by by b | where count_by > 1000 +source = table | eventstats dc(field) as distinct_count +source = table | eventstats distinct_count(category) by region +```
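 + +Because the original events are retained, an eventstats result can feed later commands. The sketch below is illustrative only, assuming hypothetical host and status fields: it tags every event with the latest status seen for its host, then keeps the events that differ from it: + +```sql ignore +source = table | eventstats latest(status) as last_status by host | where status != last_status +``` + +## Example 1: Calculate the average, sum and count of a field by group + +This example shows calculating the average age, sum of age, and count of events for all accounts grouped by gender.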
+ +```ppl +source=accounts +| fields account_number, gender, age +| eventstats avg(age), sum(age), count() by gender +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+--------+-----+--------------------+----------+---------+ +| account_number | gender | age | avg(age) | sum(age) | count() | +|----------------+--------+-----+--------------------+----------+---------| +| 1 | M | 32 | 33.666666666666664 | 101 | 3 | +| 6 | M | 36 | 33.666666666666664 | 101 | 3 | +| 13 | F | 28 | 28.0 | 28 | 1 | +| 18 | M | 33 | 33.666666666666664 | 101 | 3 | ++----------------+--------+-----+--------------------+----------+---------+ +``` + +## Example 2: Calculate the count by a gender and span + +This example shows counting events by age intervals of 5 years, grouped by gender. + +```ppl +source=accounts +| fields account_number, gender, age +| eventstats count() as cnt by span(age, 5) as age_span, gender +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+--------+-----+-----+ +| account_number | gender | age | cnt | +|----------------+--------+-----+-----| +| 1 | M | 32 | 2 | +| 6 | M | 36 | 1 | +| 13 | F | 28 | 1 | +| 18 | M | 33 | 2 | ++----------------+--------+-----+-----+ +``` + +## Example 3: Null buckets handling + +```ppl +source=accounts +| eventstats bucket_nullable=false count() as cnt by employer +| fields account_number, firstname, employer, cnt +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------+----------+------+ +| account_number | firstname | employer | cnt | +|----------------+-----------+----------+------| +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | null | ++----------------+-----------+----------+------+ +``` + +```ppl +source=accounts +| eventstats bucket_nullable=true count() as cnt by employer +| fields account_number, firstname, employer, cnt +| sort account_number +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------+----------+-----+ +| account_number | firstname | employer | cnt | +|----------------+-----------+----------+-----| +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | 1 | ++----------------+-----------+----------+-----+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/eventstats.rst b/docs/user/ppl/cmd/eventstats.rst deleted file mode 100644 index cf4ac0d9b02..00000000000 --- a/docs/user/ppl/cmd/eventstats.rst +++ /dev/null @@ -1,162 +0,0 @@ -========== -eventstats -========== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``eventstats`` command enriches your event data with calculated summary statistics. It operates by analyzing specified fields within your events, computing various statistical measures, and then appending these results as new fields to each original event. - -| Key aspects of `eventstats`: - -1. It performs calculations across the entire result set or within defined groups. -2. The original events remain intact, with new fields added to contain the statistical results. -3. The command is particularly useful for comparative analysis, identifying outliers, or providing additional context to individual events. 
- -| Difference between ``stats`` and ``eventstats`` -The ``stats`` and ``eventstats`` commands are both used for calculating statistics, but they have some key differences in how they operate and what they produce: - -* Output Format - - * ``stats``: Produces a summary table with only the calculated statistics. - * ``eventstats``: Adds the calculated statistics as new fields to the existing events, preserving the original data. - -* Event Retention - - * ``stats``: Reduces the result set to only the statistical summary, discarding individual events. - * ``eventstats``: Retains all original events and adds new fields with the calculated statistics. - -* Use Cases - - * ``stats``: Best for creating summary reports or dashboards. Often used as a final command to summarize results. - * ``eventstats``: Useful when you need to enrich events with statistical context for further analysis or filtering. Can be used mid-search to add statistics that can be used in subsequent commands. - - -Syntax -====== -eventstats [bucket_nullable=bool] ... [by-clause] - -* function: mandatory. An aggregation function or window function. -* bucket_nullable: optional. Controls whether the eventstats command consider null buckets as a valid group in group-by aggregations. When set to ``false``, it will not treat null group-by values as a distinct group during aggregation. **Default:** Determined by ``plugins.ppl.syntax.legacy.preferred``. - - * When ``plugins.ppl.syntax.legacy.preferred=true``, ``bucket_nullable`` defaults to ``true`` - * When ``plugins.ppl.syntax.legacy.preferred=false``, ``bucket_nullable`` defaults to ``false`` - -* by-clause: optional. Groups results by specified fields or expressions. Syntax: by [span-expression,] [field,]... **Default:** aggregation over the entire result set. -* span-expression: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, ``span(age, 10)`` creates 10-year age buckets, ``span(timestamp, 1h)`` creates hourly buckets. - - * Available time units: - - * millisecond (ms) - * second (s) - * minute (m, case sensitive) - * hour (h) - * day (d) - * week (w) - * month (M, case sensitive) - * quarter (q) - * year (y) - -Aggregation Functions -===================== - -The eventstats command supports the following aggregation functions: - -* COUNT: Count of values -* SUM: Sum of numeric values -* AVG: Average of numeric values -* MAX: Maximum value -* MIN: Minimum value -* VAR_SAMP: Sample variance -* VAR_POP: Population variance -* STDDEV_SAMP: Sample standard deviation -* STDDEV_POP: Population standard deviation -* DISTINCT_COUNT/DC: Distinct count of values -* EARLIEST: Earliest value by timestamp -* LATEST: Latest value by timestamp - -For detailed documentation of each function, see `Aggregation Functions <../functions/aggregations.rst>`_. - -Usage -===== - -Eventstats:: - - source = table | eventstats avg(a) - source = table | where a < 50 | eventstats count(c) - source = table | eventstats min(c), max(c) by b - source = table | eventstats count(c) as count_by by b | where count_by > 1000 - source = table | eventstats dc(field) as distinct_count - source = table | eventstats distinct_count(category) by region - - -Example 1: Calculate the average, sum and count of a field by group -=================================================================== - -This example shows calculating the average age, sum of age, and count of events for all accounts grouped by gender. 
- -PPL query:: - - os> source=accounts | fields account_number, gender, age | eventstats avg(age), sum(age), count() by gender | sort account_number; - fetched rows / total rows = 4/4 - +----------------+--------+-----+--------------------+----------+---------+ - | account_number | gender | age | avg(age) | sum(age) | count() | - |----------------+--------+-----+--------------------+----------+---------| - | 1 | M | 32 | 33.666666666666664 | 101 | 3 | - | 6 | M | 36 | 33.666666666666664 | 101 | 3 | - | 13 | F | 28 | 28.0 | 28 | 1 | - | 18 | M | 33 | 33.666666666666664 | 101 | 3 | - +----------------+--------+-----+--------------------+----------+---------+ - -Example 2: Calculate the count by a gender and span -=================================================== - -This example shows counting events by age intervals of 5 years, grouped by gender. - -PPL query:: - - os> source=accounts | fields account_number, gender, age | eventstats count() as cnt by span(age, 5) as age_span, gender | sort account_number; - fetched rows / total rows = 4/4 - +----------------+--------+-----+-----+ - | account_number | gender | age | cnt | - |----------------+--------+-----+-----| - | 1 | M | 32 | 2 | - | 6 | M | 36 | 1 | - | 13 | F | 28 | 1 | - | 18 | M | 33 | 2 | - +----------------+--------+-----+-----+ - -Example 3: Null buckets handling -================================ - -PPL query:: - - os> source=accounts | eventstats bucket_nullable=false count() as cnt by employer | fields account_number, firstname, employer, cnt | sort account_number; - fetched rows / total rows = 4/4 - +----------------+-----------+----------+------+ - | account_number | firstname | employer | cnt | - |----------------+-----------+----------+------| - | 1 | Amber | Pyrami | 1 | - | 6 | Hattie | Netagy | 1 | - | 13 | Nanette | Quility | 1 | - | 18 | Dale | null | null | - +----------------+-----------+----------+------+ - -PPL query:: - - os> source=accounts | eventstats bucket_nullable=true count() as cnt by employer | fields account_number, firstname, employer, cnt | sort account_number; - fetched rows / total rows = 4/4 - +----------------+-----------+----------+-----+ - | account_number | firstname | employer | cnt | - |----------------+-----------+----------+-----| - | 1 | Amber | Pyrami | 1 | - | 6 | Hattie | Netagy | 1 | - | 13 | Nanette | Quility | 1 | - | 18 | Dale | null | 1 | - +----------------+-----------+----------+-----+ diff --git a/docs/user/ppl/cmd/expand.md b/docs/user/ppl/cmd/expand.md new file mode 100644 index 00000000000..8fddbea7ad7 --- /dev/null +++ b/docs/user/ppl/cmd/expand.md @@ -0,0 +1,50 @@ +# expand + +## Description + +The `expand` command transforms a single document with a nested array field into multiple documents, each containing one element from the array. All other fields in the original document are duplicated across the resulting documents. + +Key aspects of `expand`: +* It generates one row per element in the specified array field. +* The specified array field is converted into individual rows. +* If an alias is provided, the expanded values appear under the alias instead of the original field name. +* If the specified field is an empty array, the row is retained with the expanded field set to null. + +## Syntax + +expand \<field\> [as alias] +* field: mandatory. The field to be expanded (exploded). Currently only nested arrays are supported. +* alias: optional. The name to use instead of the original field name.
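 + +When no alias is given, the expanded values stay under the original field name; a minimal illustrative sketch against the migration dataset described below (output omitted): + +```sql ignore +source = migration | expand address +```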
+ +## Example 1: Expand address field with an alias + +Given a dataset `migration` with the following data: + +```text +{"name":"abbas","age":24,"address":[{"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}}]} +{"name":"chen","age":32,"address":[{"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}},{"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}}]} + +``` + +The following query expands the address field and renames it to addr: + +```ppl +source=migration +| expand address as addr +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-------+-----+-------------------------------------------------------------------------------------------+ +| name | age | addr | +|-------+-----+-------------------------------------------------------------------------------------------| +| abbas | 24 | {"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}} | +| chen | 32 | {"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}} | +| chen | 32 | {"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}} | ++-------+-----+-------------------------------------------------------------------------------------------+ +``` + +## Limitations + +* The `expand` command currently only supports nested arrays. Primitive fields storing arrays are not supported. For example, a string field storing an array of strings cannot be expanded with the current implementation. \ No newline at end of file diff --git a/docs/user/ppl/cmd/expand.rst b/docs/user/ppl/cmd/expand.rst deleted file mode 100644 index c8065a2da0f..00000000000 --- a/docs/user/ppl/cmd/expand.rst +++ /dev/null @@ -1,61 +0,0 @@ -====== -expand -====== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``expand`` command transforms a single document with a nested array field into multiple documents—each containing one element from the array. All other fields in the original document are duplicated across the resulting documents. - -| Key aspects of ``expand``: - -* It generates one row per element in the specified array field. -* The specified array field is converted into individual rows. -* If an alias is provided, the expanded values appear under the alias instead of the original field name. -* If the specified field is an empty array, the row is retained with the expanded field set to null. - -Syntax -====== - -expand [as alias] - -* field: mandatory. The field to be expanded (exploded). Currently only nested arrays are supported. -* alias: optional. The name to use instead of the original field name. - - -Example 1: Expand address field with an alias -============================================= - -Given a dataset ``migration`` with the following data: - -..
code-block:: - - {"name":"abbas","age":24,"address":[{"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}}]} - {"name":"chen","age":32,"address":[{"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}},{"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}}]} - -The following query expand the address field and rename it to addr: - -PPL query:: - - PPL> source=migration | expand address as addr; - fetched rows / total rows = 3/3 - +-------+-----+-------------------------------------------------------------------------------------------+ - | name | age | addr | - |-------+-----+-------------------------------------------------------------------------------------------| - | abbas | 24 | {"city":"New york city","state":"NY","moveInDate":{"dateAndTime":"19840412T090742.000Z"}} | - | chen | 32 | {"city":"Miami","state":"Florida","moveInDate":{"dateAndTime":"19010811T040333.000Z"}} | - | chen | 32 | {"city":"los angeles","state":"CA","moveInDate":{"dateAndTime":"20230503T080742.000Z"}} | - +-------+-----+-------------------------------------------------------------------------------------------+ - -Limitations -=========== - -* The ``expand`` command currently only supports nested arrays. Primitive - fields storing arrays are not supported. E.g. a string field storing an array - of strings cannot be expanded with the current implementation. diff --git a/docs/user/ppl/cmd/explain.md b/docs/user/ppl/cmd/explain.md new file mode 100644 index 00000000000..fb60a3b1207 --- /dev/null +++ b/docs/user/ppl/cmd/explain.md @@ -0,0 +1,181 @@ +# explain + +## Description + +The `explain` command explains the execution plan of a query and is often used for query translation and troubleshooting. The `explain` command can only be used as the first command in the PPL query. +## Syntax + +explain [mode] queryStatement +* mode: optional. There are 4 explain modes: "simple", "standard", "cost", "extended". **Default:** standard. + * standard: The default mode. Display logical and physical plan with pushdown information (DSL). + * simple: Display the logical plan tree without attributes. + * cost: Display the standard information plus plan cost attributes. + * extended: Display the standard information plus generated code. +* queryStatement: mandatory. A PPL query to explain. + +## Example 1: Explain a PPL query in v2 engine + +When Calcite is disabled (plugins.calcite.enabled=false), explaining a PPL query returns the physical plan of the v2 engine and pushdown information. 
+ +```ppl +explain source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` + +Explain: + +```json +{ + "root": { + "name": "ProjectOperator", + "description": { + "fields": "[count(), country]" + }, + "children": [ + { + "name": "OpenSearchIndexScan", + "description": { + "request": """OpenSearchQueryRequest(indexName=state_country, sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"bool":{"should":[{"term":{"country":{"value":"USA","boost":1.0}}},{"term":{"country":{"value":"England","boost":1.0}}}],"adjust_pure_negative":true,"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, pitId=null, cursorKeepAlive=null, searchAfter=null, searchResponse=null)""" + }, + "children": [] + } + ] + } +} +``` + +## Example 2: Explain a PPL query in v3 engine + +When Calcite is enabled (plugins.calcite.enabled=true), explaining a PPL query will get its logical and physical plan of v3 engine and pushdown information. + +```ppl +explain source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` + +Explain + +```json +{ + "calcite": { + "logical": """LogicalProject(count()=[$1], country=[$0]) + LogicalAggregate(group=[{1}], count()=[COUNT()]) + LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]) + CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) +""", + "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#53:LogicalAggregate.NONE.[](input=RelSubset#43,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) +""" + } +} +``` + +## Example 3: Explain a PPL query with simple mode + +When Calcite is enabled (plugins.calcite.enabled=true), you can explain a PPL query with the "simple" mode. + +```ppl +explain simple source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` + +Explain + +``` +{ + "calcite": { + "logical": """LogicalProject + LogicalAggregate + LogicalFilter + CalciteLogicalIndexScan +""" + } +} +``` + +## Example 4: Explain a PPL query with cost mode + +When Calcite is enabled (plugins.calcite.enabled=true), you can explain a PPL query with the "cost" mode. 
+ +```ppl +explain cost source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` + +Explain + +```json +{ + "calcite": { + "logical": """LogicalProject(count()=[$1], country=[$0]): rowcount = 2.5, cumulative cost = {130.3125 rows, 206.0 cpu, 0.0 io}, id = 75 + LogicalAggregate(group=[{1}], count()=[COUNT()]): rowcount = 2.5, cumulative cost = {127.8125 rows, 201.0 cpu, 0.0 io}, id = 74 + LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]): rowcount = 25.0, cumulative cost = {125.0 rows, 201.0 cpu, 0.0 io}, id = 73 + CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 72 +""", + "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]): rowcount = 100.0, cumulative cost = {200.0 rows, 501.0 cpu, 0.0 io}, id = 138 + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#125:LogicalAggregate.NONE.[](input=RelSubset#115,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 133 +""" + } +} +``` + +## Example 5: Explain a PPL query with extended mode + +```ppl +explain extended source=state_country +| where country = 'USA' OR country = 'England' +| stats count() by country +``` + +Explain + +```json +{ + "calcite": { + "logical": """LogicalProject(count()=[$1], country=[$0]) + LogicalAggregate(group=[{1}], count()=[COUNT()]) + LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]) + CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) +""", + "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#193:LogicalAggregate.NONE.[](input=RelSubset#183,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) +""", + "extended": """public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) { + final org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan v1stashed = (org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan) root.get("v1stashed"); + final org.apache.calcite.linq4j.Enumerable _inputEnumerable = v1stashed.scan(); + return new org.apache.calcite.linq4j.AbstractEnumerable(){ + public org.apache.calcite.linq4j.Enumerator enumerator() { + return new org.apache.calcite.linq4j.Enumerator(){ + public final org.apache.calcite.linq4j.Enumerator inputEnumerator = 
_inputEnumerable.enumerator(); + public void reset() { + inputEnumerator.reset(); + } + public boolean moveNext() { + return inputEnumerator.moveNext(); + } + public void close() { + inputEnumerator.close(); + } + public Object current() { + final Object[] current = (Object[]) inputEnumerator.current(); + final Object input_value = current[1]; + final Object input_value0 = current[0]; + return new Object[] { + input_value, + input_value0}; + } + }; + } + }; +} +public Class getElementType() { + return java.lang.Object[].class; +} +""" + } +} +``` \ No newline at end of file diff --git a/docs/user/ppl/cmd/explain.rst b/docs/user/ppl/cmd/explain.rst deleted file mode 100644 index fb14dfd39f9..00000000000 --- a/docs/user/ppl/cmd/explain.rst +++ /dev/null @@ -1,190 +0,0 @@ -======= -explain -======= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -The ``explain`` command explains the plan of query which is often used for query translation and troubleshooting. The ``explain`` command can only be used as the first command in the PPL query. - -Syntax -====== -explain queryStatement - -* mode: optional. There are 4 explain modes: "simple", "standard", "cost", "extended". **Default:** standard. - - * standard: The default mode. Display logical and physical plan with pushdown information (DSL). - * simple: Display the logical plan tree without attributes. - * cost: Display the standard information plus plan cost attributes. - * extended: Display the standard information plus generated code. - -* queryStatement: mandatory. A PPL query to explain. - - - -Example 1: Explain a PPL query in v2 engine -=========================================== -When Calcite is disabled (plugins.calcite.enabled=false), explaining a PPL query will get its physical plan of v2 engine and pushdown information. - -PPL query:: - - PPL> explain source=state_country | where country = 'USA' OR country = 'England' | stats count() by country - -Explain:: - - { - "root": { - "name": "ProjectOperator", - "description": { - "fields": "[count(), country]" - }, - "children": [ - { - "name": "OpenSearchIndexScan", - "description": { - "request": """OpenSearchQueryRequest(indexName=state_country, sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"bool":{"should":[{"term":{"country":{"value":"USA","boost":1.0}}},{"term":{"country":{"value":"England","boost":1.0}}}],"adjust_pure_negative":true,"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, pitId=null, cursorKeepAlive=null, searchAfter=null, searchResponse=null)""" - }, - "children": [] - } - ] - } - } - -Example 2: Explain a PPL query in v3 engine -=========================================== - -When Calcite is enabled (plugins.calcite.enabled=true), explaining a PPL query will get its logical and physical plan of v3 engine and pushdown information. 
- -PPL query:: - - PPL> explain source=state_country | where country = 'USA' OR country = 'England' | stats count() by country - -Explain:: - - { - "calcite": { - "logical": """LogicalProject(count()=[$1], country=[$0]) - LogicalAggregate(group=[{1}], count()=[COUNT()]) - LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]) - CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) - """, - "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) - CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#53:LogicalAggregate.NONE.[](input=RelSubset#43,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) - """ - } - } - - -Example 3: Explain a PPL query with simple mode -=============================================== - -When Calcite is enabled (plugins.calcite.enabled=true), you can explain a PPL query with the "simple" mode. - -PPL query:: - - PPL> explain simple source=state_country | where country = 'USA' OR country = 'England' | stats count() by country - -Explain:: - - { - "calcite": { - "logical": """LogicalProject - LogicalAggregate - LogicalFilter - CalciteLogicalIndexScan - """ - } - } - -Example 4: Explain a PPL query with cost mode -============================================= - -When Calcite is enabled (plugins.calcite.enabled=true), you can explain a PPL query with the "cost" mode. 
- -PPL query:: - - PPL> explain cost source=state_country | where country = 'USA' OR country = 'England' | stats count() by country - -Explain:: - - { - "calcite": { - "logical": """LogicalProject(count()=[$1], country=[$0]): rowcount = 2.5, cumulative cost = {130.3125 rows, 206.0 cpu, 0.0 io}, id = 75 - LogicalAggregate(group=[{1}], count()=[COUNT()]): rowcount = 2.5, cumulative cost = {127.8125 rows, 201.0 cpu, 0.0 io}, id = 74 - LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]): rowcount = 25.0, cumulative cost = {125.0 rows, 201.0 cpu, 0.0 io}, id = 73 - CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 72 - """, - "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]): rowcount = 100.0, cumulative cost = {200.0 rows, 501.0 cpu, 0.0 io}, id = 138 - CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#125:LogicalAggregate.NONE.[](input=RelSubset#115,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]): rowcount = 100.0, cumulative cost = {100.0 rows, 101.0 cpu, 0.0 io}, id = 133 - """ - } - } - -Example 5: Explain a PPL query with extended mode -================================================= - -PPL query:: - - PPL> explain extended source=state_country | where country = 'USA' OR country = 'England' | stats count() by country - -Explain:: - - { - "calcite": { - "logical": """LogicalProject(count()=[$1], country=[$0]) - LogicalAggregate(group=[{1}], count()=[COUNT()]) - LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]) - CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) - """, - "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) - CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#193:LogicalAggregate.NONE.[](input=RelSubset#183,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) - """, - "extended": """public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) { - final org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan v1stashed = (org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan) root.get("v1stashed"); - final org.apache.calcite.linq4j.Enumerable _inputEnumerable = v1stashed.scan(); - return new org.apache.calcite.linq4j.AbstractEnumerable(){ - public org.apache.calcite.linq4j.Enumerator enumerator() { - return new org.apache.calcite.linq4j.Enumerator(){ - public final 
org.apache.calcite.linq4j.Enumerator inputEnumerator = _inputEnumerable.enumerator(); - public void reset() { - inputEnumerator.reset(); - } - - public boolean moveNext() { - return inputEnumerator.moveNext(); - } - - public void close() { - inputEnumerator.close(); - } - - public Object current() { - final Object[] current = (Object[]) inputEnumerator.current(); - final Object input_value = current[1]; - final Object input_value0 = current[0]; - return new Object[] { - input_value, - input_value0}; - } - - }; - } - - }; - } - - - public Class getElementType() { - return java.lang.Object[].class; - } - - - """ - } - } diff --git a/docs/user/ppl/cmd/fields.md b/docs/user/ppl/cmd/fields.md new file mode 100644 index 00000000000..507a8e6903f --- /dev/null +++ b/docs/user/ppl/cmd/fields.md @@ -0,0 +1,244 @@ +# fields + +## Description + +The `fields` command keeps or removes fields from the search result. +## Syntax + +fields [+\|-] \<field-list\> +* +\|-: optional. If the plus (+) is used, only the fields specified in the field list will be kept. If the minus (-) is used, all the fields specified in the field list will be removed. **Default:** +. +* field-list: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns. + +## Example 1: Select specified fields from result + +This example shows selecting account_number, firstname and lastname fields from search results. + +```ppl +source=accounts +| fields account_number, firstname, lastname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------+----------+ +| account_number | firstname | lastname | +|----------------+-----------+----------| +| 1 | Amber | Duke | +| 6 | Hattie | Bond | +| 13 | Nanette | Bates | +| 18 | Dale | Adams | ++----------------+-----------+----------+ +``` + +## Example 2: Remove specified fields from result + +This example shows removing the account_number field from search results. + +```ppl +source=accounts +| fields account_number, firstname, lastname +| fields - account_number +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------+ +| firstname | lastname | +|-----------+----------| +| Amber | Duke | +| Hattie | Bond | +| Nanette | Bates | +| Dale | Adams | ++-----------+----------+ +``` + +## Example 3: Space-delimited field selection + +Fields can be specified using spaces instead of commas, providing a more concise syntax. +**Syntax**: `fields field1 field2 field3` + +```ppl +source=accounts +| fields firstname lastname age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------+-----+ +| firstname | lastname | age | +|-----------+----------+-----| +| Amber | Duke | 32 | +| Hattie | Bond | 36 | +| Nanette | Bates | 28 | +| Dale | Adams | 33 | ++-----------+----------+-----+ +``` + +## Example 4: Prefix wildcard pattern + +Select fields starting with a pattern using prefix wildcards. + +```ppl +source=accounts +| fields account* +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+ +| account_number | +|----------------| +| 1 | +| 6 | +| 13 | +| 18 | ++----------------+ +``` + +## Example 5: Suffix wildcard pattern + +Select fields ending with a pattern using suffix wildcards. 
+ +```ppl +source=accounts +| fields *name +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------+ +| firstname | lastname | +|-----------+----------| +| Amber | Duke | +| Hattie | Bond | +| Nanette | Bates | +| Dale | Adams | ++-----------+----------+ +``` + +## Example 6: Contains wildcard pattern + +Select fields containing a pattern using contains wildcards. + +```ppl +source=accounts +| fields *a* +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+-----------+-----------------+---------+-------+-----+----------------------+----------+ +| account_number | firstname | address | balance | state | age | email | lastname | +|----------------+-----------+-----------------+---------+-------+-----+----------------------+----------| +| 1 | Amber | 880 Holmes Lane | 39225 | IL | 32 | amberduke@pyrami.com | Duke | ++----------------+-----------+-----------------+---------+-------+-----+----------------------+----------+ +``` + +## Example 7: Mixed delimiter syntax + +Combine spaces and commas for flexible field specification. + +```ppl +source=accounts +| fields firstname, account* *name +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------------+----------+ +| firstname | account_number | lastname | +|-----------+----------------+----------| +| Amber | 1 | Duke | +| Hattie | 6 | Bond | +| Nanette | 13 | Bates | +| Dale | 18 | Adams | ++-----------+----------------+----------+ +``` + +## Example 8: Field deduplication + +Automatically prevents duplicate columns when wildcards expand to already specified fields. + +```ppl +source=accounts +| fields firstname, *name +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------+ +| firstname | lastname | +|-----------+----------| +| Amber | Duke | +| Hattie | Bond | +| Nanette | Bates | +| Dale | Adams | ++-----------+----------+ +``` + +Note: Even though `firstname` is explicitly specified and would also match `*name`, it appears only once due to automatic deduplication. +## Example 9: Full wildcard selection + +Select all available fields using `*` or `` `*` ``. This selects all fields defined in the index schema, including fields that may contain null values. + +```ppl +source=accounts +| fields `*` +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+-----------+-----------------+---------+--------+--------+----------+-------+-----+----------------------+----------+ +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +|----------------+-----------+-----------------+---------+--------+--------+----------+-------+-----+----------------------+----------| +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | ++----------------+-----------+-----------------+---------+--------+--------+----------+-------+-----+----------------------+----------+ +``` + +Note: The `*` wildcard selects fields based on the index schema, not on data content. Fields with null values are included in the result set. Use backticks `` `*` `` if the plain `*` doesn't return all expected fields. +## Example 10: Wildcard exclusion + +Remove fields using wildcard patterns with the minus (-) operator. 
+ +```ppl +source=accounts +| fields - *name +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+ +| account_number | address | balance | gender | city | employer | state | age | email | +|----------------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------| +| 1 | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | +| 6 | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | +| 13 | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | +| 18 | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | ++----------------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+ +``` + +## See Also + +- [table](table.md) - Alias command with identical functionality \ No newline at end of file diff --git a/docs/user/ppl/cmd/fields.rst b/docs/user/ppl/cmd/fields.rst deleted file mode 100644 index 81ccff71b80..00000000000 --- a/docs/user/ppl/cmd/fields.rst +++ /dev/null @@ -1,206 +0,0 @@ -====== -fields -====== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -The ``fields`` command keeps or removes fields from the search result. - -Syntax -====== -fields [+|-] - -* +|-: optional. If the plus (+) is used, only the fields specified in the field list will be kept. If the minus (-) is used, all the fields specified in the field list will be removed. **Default:** +. -* field-list: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns. - -Example 1: Select specified fields from result -============================================== - -This example shows selecting account_number, firstname and lastname fields from search results. - -PPL query:: - - os> source=accounts | fields account_number, firstname, lastname; - fetched rows / total rows = 4/4 - +----------------+-----------+----------+ - | account_number | firstname | lastname | - |----------------+-----------+----------| - | 1 | Amber | Duke | - | 6 | Hattie | Bond | - | 13 | Nanette | Bates | - | 18 | Dale | Adams | - +----------------+-----------+----------+ - -Example 2: Remove specified fields from result -============================================== - -This example shows removing the account_number field from search results. - -PPL query:: - - os> source=accounts | fields account_number, firstname, lastname | fields - account_number ; - fetched rows / total rows = 4/4 - +-----------+----------+ - | firstname | lastname | - |-----------+----------| - | Amber | Duke | - | Hattie | Bond | - | Nanette | Bates | - | Dale | Adams | - +-----------+----------+ - -Example 3: Space-delimited field selection -========================================== - -Fields can be specified using spaces instead of commas, providing a more concise syntax. 
- -**Syntax**: ``fields field1 field2 field3`` - -PPL query:: - - os> source=accounts | fields firstname lastname age; - fetched rows / total rows = 4/4 - +-----------+----------+-----+ - | firstname | lastname | age | - |-----------+----------+-----| - | Amber | Duke | 32 | - | Hattie | Bond | 36 | - | Nanette | Bates | 28 | - | Dale | Adams | 33 | - +-----------+----------+-----+ - -Example 4: Prefix wildcard pattern -================================== - -Select fields starting with a pattern using prefix wildcards. - -PPL query:: - - os> source=accounts | fields account*; - fetched rows / total rows = 4/4 - +----------------+ - | account_number | - |----------------| - | 1 | - | 6 | - | 13 | - | 18 | - +----------------+ - -Example 5: Suffix wildcard pattern -================================== - -Select fields ending with a pattern using suffix wildcards. - -PPL query:: - - os> source=accounts | fields *name; - fetched rows / total rows = 4/4 - +-----------+----------+ - | firstname | lastname | - |-----------+----------| - | Amber | Duke | - | Hattie | Bond | - | Nanette | Bates | - | Dale | Adams | - +-----------+----------+ - -Example 6: Contains wildcard pattern -==================================== - -Select fields containing a pattern using contains wildcards. - -PPL query:: - - os> source=accounts | fields *a* | head 1; - fetched rows / total rows = 1/1 - +----------------+-----------+-----------------+---------+-------+-----+----------------------+----------+ - | account_number | firstname | address | balance | state | age | email | lastname | - |----------------+-----------+-----------------+---------+-------+-----+----------------------+----------| - | 1 | Amber | 880 Holmes Lane | 39225 | IL | 32 | amberduke@pyrami.com | Duke | - +----------------+-----------+-----------------+---------+-------+-----+----------------------+----------+ - -Example 7: Mixed delimiter syntax -================================= - -Combine spaces and commas for flexible field specification. - -PPL query:: - - os> source=accounts | fields firstname, account* *name; - fetched rows / total rows = 4/4 - +-----------+----------------+----------+ - | firstname | account_number | lastname | - |-----------+----------------+----------| - | Amber | 1 | Duke | - | Hattie | 6 | Bond | - | Nanette | 13 | Bates | - | Dale | 18 | Adams | - +-----------+----------------+----------+ - -Example 8: Field deduplication -============================== - -Automatically prevents duplicate columns when wildcards expand to already specified fields. - -PPL query:: - - os> source=accounts | fields firstname, *name; - fetched rows / total rows = 4/4 - +-----------+----------+ - | firstname | lastname | - |-----------+----------| - | Amber | Duke | - | Hattie | Bond | - | Nanette | Bates | - | Dale | Adams | - +-----------+----------+ - -Note: Even though ``firstname`` is explicitly specified and would also match ``*name``, it appears only once due to automatic deduplication. - -Example 9: Full wildcard selection -================================== - -Select all available fields using ``*`` or ```*```. This selects all fields defined in the index schema, including fields that may contain null values. 
- -PPL query:: - - os> source=accounts | fields `*` | head 1; - fetched rows / total rows = 1/1 - +----------------+-----------+-----------------+---------+--------+--------+----------+-------+-----+----------------------+----------+ - | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | - |----------------+-----------+-----------------+---------+--------+--------+----------+-------+-----+----------------------+----------| - | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | - +----------------+-----------+-----------------+---------+--------+--------+----------+-------+-----+----------------------+----------+ - -Note: The ``*`` wildcard selects fields based on the index schema, not on data content. Fields with null values are included in the result set. Use backticks ```*``` if the plain ``*`` doesn't return all expected fields. - -Example 10: Wildcard exclusion -============================== - -Remove fields using wildcard patterns with the minus (-) operator. - -PPL query:: - - os> source=accounts | fields - *name; - fetched rows / total rows = 4/4 - +----------------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+ - | account_number | address | balance | gender | city | employer | state | age | email | - |----------------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------| - | 1 | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | - | 6 | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | - | 13 | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | - | 18 | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | - +----------------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+ - - -See Also -======== -- `table `_ - Alias command with identical functionality diff --git a/docs/user/ppl/cmd/fillnull.md b/docs/user/ppl/cmd/fillnull.md new file mode 100644 index 00000000000..40ed91e8653 --- /dev/null +++ b/docs/user/ppl/cmd/fillnull.md @@ -0,0 +1,176 @@ +# fillnull + +## Description + +The `fillnull` command fills null values with the provided value in one or more fields in the search result. +## Syntax + +fillnull with \<replacement\> [in \<field-list\>] +fillnull using \<field\> = \<replacement\> [, \<field\> = \<replacement\>] +fillnull value=\<replacement\> [\<field-list\>] +* replacement: mandatory. The value used to replace null values. +* field-list: optional. List of fields to apply the replacement to. It can be comma-delimited (with `with` or `using` syntax) or space-delimited (with `value=` syntax). **Default:** all fields. +* field: mandatory when using `using` syntax. Individual field name to assign a specific replacement value. +* **Syntax variations** + * `with <replacement> in <field-list>` - Apply same value to specified fields + * `using <field> = <replacement>, ...` - Apply different values to different fields + * `value=<replacement> [<field-list>]` - Alternative syntax with optional space-delimited field list + +## Example 1: Replace null values with a specified value on one field + +This example shows replacing null values in the email field with '\<not found\>'. 
+ +```ppl +source=accounts +| fields email, employer +| fillnull with '<not found>' in email +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------+----------+ +| email | employer | +|-----------------------+----------| +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| <not found> | Quility | +| daleadams@boink.com | null | ++-----------------------+----------+ +``` + +## Example 2: Replace null values with a specified value on multiple fields + +This example shows replacing null values in both email and employer fields with the same replacement value '\<not found\>'. + +```ppl +source=accounts +| fields email, employer +| fillnull with '<not found>' in email, employer +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------+-------------+ +| email | employer | +|-----------------------+-------------| +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| <not found> | Quility | +| daleadams@boink.com | <not found> | ++-----------------------+-------------+ +``` + +## Example 3: Replace null values with a specified value on all fields + +This example shows replacing null values in all fields when no field list is specified. + +```ppl +source=accounts +| fields email, employer +| fillnull with '<not found>' +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------+-------------+ +| email | employer | +|-----------------------+-------------| +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| <not found> | Quility | +| daleadams@boink.com | <not found> | ++-----------------------+-------------+ +``` + +## Example 4: Replace null values with multiple specified values on multiple fields + +This example shows using different replacement values for different fields using the 'using' syntax. + +```ppl +source=accounts +| fields email, employer +| fillnull using email = '<not found>', employer = '<no employer>' +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------+---------------+ +| email | employer | +|-----------------------+---------------| +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| <not found> | Quility | +| daleadams@boink.com | <no employer> | ++-----------------------+---------------+ +``` + +## Example 5: Replace null with specified value on specific fields (value= syntax) + +This example shows using the alternative 'value=' syntax to replace null values in specific fields. + +```ppl +source=accounts +| fields email, employer +| fillnull value="<not found>" email employer +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------+-------------+ +| email | employer | +|-----------------------+-------------| +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| <not found> | Quility | +| daleadams@boink.com | <not found> | ++-----------------------+-------------+ +``` + +## Example 6: Replace null with specified value on all fields (value= syntax) + +When no field list is specified, the replacement applies to all fields in the result. + +```ppl +source=accounts +| fields email, employer +| fillnull value='<not found>' +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------+-------------+ +| email | employer | +|-----------------------+-------------| +| amberduke@pyrami.com | Pyrami | +| hattiebond@netagy.com | Netagy | +| <not found> | Quility | +| daleadams@boink.com | <not found> | ++-----------------------+-------------+ +``` + +## Limitations + +* The `fillnull` command is not rewritten to OpenSearch DSL; it is executed only on the coordination node. 
+* When applying the same value to all fields without specifying field names, all fields must be the same type. For mixed types, use separate fillnull commands or explicitly specify fields. +* The replacement value type must match ALL field types in the field list. When applying the same value to multiple fields, all fields must be the same type (all strings or all numeric). + + **Example:** + +```sql ignore + # This FAILS - same value for mixed-type fields + source=accounts | fillnull value=0 firstname, age + # ERROR: fillnull failed: replacement value type INTEGER is not compatible with field 'firstname' (type: VARCHAR). The replacement value type must match the field type. +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/fillnull.rst b/docs/user/ppl/cmd/fillnull.rst deleted file mode 100644 index 7ebceee019a..00000000000 --- a/docs/user/ppl/cmd/fillnull.rst +++ /dev/null @@ -1,156 +0,0 @@ -======== -fillnull -======== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``fillnull`` command fills null values with the provided value in one or more fields in the search result. - - -Syntax -====== - -| fillnull with [in ] -| fillnull using = [, = ] -| fillnull value= [] - -* replacement: mandatory. The value used to replace null values. -* field-list: optional. List of fields to apply the replacement to. Can be comma-delimited (with ``with`` or ``using`` syntax) or space-delimited (with ``value=`` syntax). **Default:** all fields. -* field: mandatory when using ``using`` syntax. Individual field name to assign a specific replacement value. - -* **Syntax variations** - - * ``with in `` - Apply same value to specified fields - * ``using =, ...`` - Apply different values to different fields - * ``value= []`` - Alternative syntax with optional space-delimited field list - -Example 1: Replace null values with a specified value on one field -================================================================== - -This example shows replacing null values in the email field with ''. - -PPL query:: - - os> source=accounts | fields email, employer | fillnull with '' in email; - fetched rows / total rows = 4/4 - +-----------------------+----------+ - | email | employer | - |-----------------------+----------| - | amberduke@pyrami.com | Pyrami | - | hattiebond@netagy.com | Netagy | - | | Quility | - | daleadams@boink.com | null | - +-----------------------+----------+ - -Example 2: Replace null values with a specified value on multiple fields -======================================================================== - -This example shows replacing null values in both email and employer fields with the same replacement value ''. - -PPL query:: - - os> source=accounts | fields email, employer | fillnull with '' in email, employer; - fetched rows / total rows = 4/4 - +-----------------------+-------------+ - | email | employer | - |-----------------------+-------------| - | amberduke@pyrami.com | Pyrami | - | hattiebond@netagy.com | Netagy | - | | Quility | - | daleadams@boink.com | | - +-----------------------+-------------+ - -Example 3: Replace null values with a specified value on all fields -=================================================================== - -This example shows replacing null values in all fields when no field list is specified. 
- -PPL query:: - - PPL> source=accounts | fields email, employer | fillnull with ''; - fetched rows / total rows = 4/4 - +-----------------------+-------------+ - | email | employer | - |-----------------------+-------------| - | amberduke@pyrami.com | Pyrami | - | hattiebond@netagy.com | Netagy | - | | Quility | - | daleadams@boink.com | | - +-----------------------+-------------+ - -Example 4: Replace null values with multiple specified values on multiple fields -================================================================================ - -This example shows using different replacement values for different fields using the 'using' syntax. - -PPL query:: - - os> source=accounts | fields email, employer | fillnull using email = '', employer = ''; - fetched rows / total rows = 4/4 - +-----------------------+---------------+ - | email | employer | - |-----------------------+---------------| - | amberduke@pyrami.com | Pyrami | - | hattiebond@netagy.com | Netagy | - | | Quility | - | daleadams@boink.com | | - +-----------------------+---------------+ - - -Example 5: Replace null with specified value on specific fields (value= syntax) -=============================================================================== - -This example shows using the alternative 'value=' syntax to replace null values in specific fields. - -PPL query:: - - os> source=accounts | fields email, employer | fillnull value="" email employer; - fetched rows / total rows = 4/4 - +-----------------------+-------------+ - | email | employer | - |-----------------------+-------------| - | amberduke@pyrami.com | Pyrami | - | hattiebond@netagy.com | Netagy | - | | Quility | - | daleadams@boink.com | | - +-----------------------+-------------+ - -Example 6: Replace null with specified value on all fields (value= syntax) -========================================================================== - -When no field list is specified, the replacement applies to all fields in the result. - -PPL query:: - - os> source=accounts | fields email, employer | fillnull value=''; - fetched rows / total rows = 4/4 - +-----------------------+-------------+ - | email | employer | - |-----------------------+-------------| - | amberduke@pyrami.com | Pyrami | - | hattiebond@netagy.com | Netagy | - | | Quility | - | daleadams@boink.com | | - +-----------------------+-------------+ - -Limitations -=========== -* The ``fillnull`` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node. -* When applying the same value to all fields without specifying field names, all fields must be the same type. For mixed types, use separate fillnull commands or explicitly specify fields. -* The replacement value type must match ALL field types in the field list. When applying the same value to multiple fields, all fields must be the same type (all strings or all numeric). - - **Example:** - - .. code-block:: sql - - # This FAILS - same value for mixed-type fields - source=accounts | fillnull value=0 firstname, age - # ERROR: fillnull failed: replacement value type INTEGER is not compatible with field 'firstname' (type: VARCHAR). The replacement value type must match the field type. - diff --git a/docs/user/ppl/cmd/flatten.md b/docs/user/ppl/cmd/flatten.md new file mode 100644 index 00000000000..ba4f9077dcb --- /dev/null +++ b/docs/user/ppl/cmd/flatten.md @@ -0,0 +1,93 @@ +# flatten + +## Description + +The `flatten` command flattens a struct or an object field into separate fields in a document. 
+The flattened fields will be ordered **lexicographically** by their original key names in the struct. For example, if the struct has keys `b`, `c` and `Z`, the flattened fields will be ordered as `Z`, `b`, `c`. +Note that `flatten` should not be applied to arrays. Use the `expand` command to expand an array field into multiple rows instead. However, since an array can be stored in a non-array field in OpenSearch, when flattening a field storing a nested array, only the first element of the array will be flattened. +## Syntax + +flatten \<field\> [as (\<alias-list\>)] +* field: mandatory. The field to be flattened. Only object and nested fields are supported. +* alias-list: optional. The names to use instead of the original key names. Names are separated by commas. It is advised to put the alias-list in parentheses if there is more than one alias. The length must match the number of keys in the struct field. The provided alias names **must** follow the lexicographical order of the corresponding original keys in the struct. + +## Example: flatten an object field with aliases + +This example shows flattening a message object field and using aliases to rename the flattened fields. +Given the following index `my-index` + +```text + {"message":{"info":"a","author":"e","dayOfWeek":1},"myNum":1} + {"message":{"info":"b","author":"f","dayOfWeek":2},"myNum":2} + +``` + +with the following mapping: + +```json + { + "mappings": { + "properties": { + "message": { + "type": "object", + "properties": { + "info": { + "type": "keyword", + "index": "true" + }, + "author": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + }, + "index": "true" + }, + "dayOfWeek": { + "type": "long" + } + } + }, + "myNum": { + "type": "long" + } + } + } + } + + +``` + +The following query flattens the `message` field and renames the keys to +`creator, dow, info`: + +```ppl +source=my-index +| flatten message as (creator, dow, info) +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----------------------------------------+--------+---------+-----+------+ +| message | myNum | creator | dow | info | +|-----------------------------------------|--------|---------|-----|------| +| {"info":"a","author":"e","dayOfWeek":1} | 1 | e | 1 | a | +| {"info":"b","author":"f","dayOfWeek":2} | 2 | f | 2 | b | ++-----------------------------------------+--------+---------+-----+------+ +``` + +## Limitations + +* The `flatten` command may not work as expected when its flattened fields are invisible. For example, in the query `source=my-index | fields message | flatten message`, the `flatten message` command doesn't work since flattened fields such as `message.info` and `message.author` are invisible after the `fields message` command. As an alternative, you can change the query to `source=my-index | flatten message`. \ No newline at end of file diff --git a/docs/user/ppl/cmd/flatten.rst b/docs/user/ppl/cmd/flatten.rst deleted file mode 100644 index e366fe32daa..00000000000 --- a/docs/user/ppl/cmd/flatten.rst +++ /dev/null @@ -1,101 +0,0 @@ -======= -flatten -======= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - -Description -=========== -| The ``flatten`` command flattens a struct or an object field into separate fields in a document. - -| The flattened fields will be ordered **lexicographically** by their original key names in the struct. For example, if the struct has keys ``b``, ``c`` and ``Z``, the flattened fields will be ordered as ``Z``, ``b``, ``c``. 
- -| Note that ``flatten`` should not be applied to arrays. Use the ``expand`` command to expand an array field into multiple rows instead. However, since an array can be stored in a non-array field in OpenSearch, when flattening a field storing a nested array, only the first element of the array will be flattened. - -Syntax -====== - -flatten [as ()] - -* field: mandatory. The field to be flattened. Only object and nested fields are supported. -* alias-list: optional. The names to use instead of the original key names. Names are separated by commas. It is advised to put the alias-list in parentheses if there is more than one alias. The length must match the number of keys in the struct field. The provided alias names **must** follow the lexicographical order of the corresponding original keys in the struct. - -Example: flatten an object field with aliases -============================================= - -This example shows flattening a message object field and using aliases to rename the flattened fields. - -Given the following index ``my-index`` - -.. code-block:: - - {"message":{"info":"a","author":"e","dayOfWeek":1},"myNum":1} - {"message":{"info":"b","author":"f","dayOfWeek":2},"myNum":2} - -with the following mapping: - -.. code-block:: json - - { - "mappings": { - "properties": { - "message": { - "type": "object", - "properties": { - "info": { - "type": "keyword", - "index": "true" - }, - "author": { - "type": "keyword", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - }, - "index": "true" - }, - "dayOfWeek": { - "type": "long" - } - } - }, - "myNum": { - "type": "long" - } - } - } - } - - -The following query flattens the ``message`` field and renames the keys to -``creator, dow, info``: - -PPL query:: - - PPL> source=my-index | flatten message as (creator, dow, info); - fetched rows / total rows = 2/2 - +-----------------------------------------+--------+---------+-----+------+ - | message | myNum | creator | dow | info | - |-----------------------------------------|--------|---------|-----|------| - | {"info":"a","author":"e","dayOfWeek":1} | 1 | e | 1 | a | - | {"info":"b","author":"f","dayOfWeek":2} | 2 | f | 2 | b | - +-----------------------------------------+--------+---------+-----+------+ - -Limitations -=========== -* ``flatten`` command may not work as expected when its flattened fields are - invisible. - - For example in query - ``source=my-index | fields message | flatten message``, the - ``flatten message`` command doesn't work since some flattened fields such as - ``message.info`` and ``message.author`` after command ``fields message`` are - invisible. - - As an alternative, you can change to ``source=my-index | flatten message``. diff --git a/docs/user/ppl/cmd/grok.md b/docs/user/ppl/cmd/grok.md new file mode 100644 index 00000000000..c2636b5358b --- /dev/null +++ b/docs/user/ppl/cmd/grok.md @@ -0,0 +1,86 @@ +# grok + +## Description + +The `grok` command parses a text field with a grok pattern and appends the results to the search result. +## Syntax + +grok \<field\> \<pattern\> +* field: mandatory. The field must be a text field. +* pattern: mandatory. The grok pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field. + +## Example 1: Create the new field + +This example shows how to create a new field `host` for each document. `host` will be the host name after `@` in the `email` field. Parsing a null field will return an empty string. 
+ +```ppl +source=accounts +| grok email '.+@%{HOSTNAME:host}' +| fields email, host +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------+------------+ +| email | host | +|-----------------------+------------| +| amberduke@pyrami.com | pyrami.com | +| hattiebond@netagy.com | netagy.com | +| null | | +| daleadams@boink.com | boink.com | ++-----------------------+------------+ +``` + +## Example 2: Override the existing field + +This example shows how to override the existing `address` field with street number removed. + +```ppl +source=accounts +| grok address '%{NUMBER} %{GREEDYDATA:address}' +| fields address +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++------------------+ +| address | +|------------------| +| Holmes Lane | +| Bristol Street | +| Madison Street | +| Hutchinson Court | ++------------------+ +``` + +## Example 3: Using grok to parse logs + +This example shows how to use grok to parse raw logs. + +```ppl +source=apache +| grok message '%{COMMONAPACHELOG}' +| fields COMMONAPACHELOG, timestamp, response, bytes +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------------------------------------------------------------------------------------------------------------+----------------------------+----------+-------+ +| COMMONAPACHELOG | timestamp | response | bytes | +|-----------------------------------------------------------------------------------------------------------------------------+----------------------------+----------+-------| +| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | 28/Sep/2022:10:15:57 -0700 | 404 | 19927 | +| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | 28/Sep/2022:10:15:57 -0700 | 100 | 28722 | +| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | 28/Sep/2022:10:15:57 -0700 | 401 | 27439 | +| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | 28/Sep/2022:10:15:57 -0700 | 301 | 9481 | ++-----------------------------------------------------------------------------------------------------------------------------+----------------------------+----------+-------+ +``` + +## Limitations + +The grok command has the same limitations as the parse command, see [parse limitations](./parse.md#Limitations) for details. \ No newline at end of file diff --git a/docs/user/ppl/cmd/grok.rst b/docs/user/ppl/cmd/grok.rst deleted file mode 100644 index 836d01b6a89..00000000000 --- a/docs/user/ppl/cmd/grok.rst +++ /dev/null @@ -1,81 +0,0 @@ -==== -grok -==== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -The ``grok`` command parses a text field with a grok pattern and appends the results to the search result. - -Syntax -====== -grok - -* field: mandatory. The field must be a text field. -* pattern: mandatory. The grok pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field. - -Example 1: Create the new field -=============================== - -This example shows how to create new field ``host`` for each document. ``host`` will be the host name after ``@`` in ``email`` field. Parsing a null field will return an empty string. 
- -PPL query:: - - os> source=accounts | grok email '.+@%{HOSTNAME:host}' | fields email, host ; - fetched rows / total rows = 4/4 - +-----------------------+------------+ - | email | host | - |-----------------------+------------| - | amberduke@pyrami.com | pyrami.com | - | hattiebond@netagy.com | netagy.com | - | null | | - | daleadams@boink.com | boink.com | - +-----------------------+------------+ - - -Example 2: Override the existing field -====================================== - -This example shows how to override the existing ``address`` field with street number removed. - -PPL query:: - - os> source=accounts | grok address '%{NUMBER} %{GREEDYDATA:address}' | fields address ; - fetched rows / total rows = 4/4 - +------------------+ - | address | - |------------------| - | Holmes Lane | - | Bristol Street | - | Madison Street | - | Hutchinson Court | - +------------------+ - -Example 3: Using grok to parse logs -=================================== - -This example shows how to use grok to parse raw logs. - -PPL query:: - - os> source=apache | grok message '%{COMMONAPACHELOG}' | fields COMMONAPACHELOG, timestamp, response, bytes ; - fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+----------------------------+----------+-------+ - | COMMONAPACHELOG | timestamp | response | bytes | - |-----------------------------------------------------------------------------------------------------------------------------+----------------------------+----------+-------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | 28/Sep/2022:10:15:57 -0700 | 404 | 19927 | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | 28/Sep/2022:10:15:57 -0700 | 100 | 28722 | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | 28/Sep/2022:10:15:57 -0700 | 401 | 27439 | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | 28/Sep/2022:10:15:57 -0700 | 301 | 9481 | - +-----------------------------------------------------------------------------------------------------------------------------+----------------------------+----------+-------+ - -Limitations -=========== - -The grok command has the same limitations as the parse command, see `parse limitations <./parse.rst#Limitations>`_ for details. diff --git a/docs/user/ppl/cmd/head.md b/docs/user/ppl/cmd/head.md new file mode 100644 index 00000000000..5565c90d782 --- /dev/null +++ b/docs/user/ppl/cmd/head.md @@ -0,0 +1,84 @@ +# head + +## Description + +The `head` command returns the first N results after an optional offset, in search order. +## Syntax + +head [\<size\>] [from \<offset\>] +* size: optional integer. Number of results to return. **Default:** 10 +* offset: optional integer after `from`. Number of results to skip. **Default:** 0 + +## Example 1: Get first 10 results + +This example shows getting a maximum of 10 results from accounts index. + +```ppl +source=accounts +| fields firstname, age +| head +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+-----+ +| firstname | age | +|-----------+-----| +| Amber | 32 | +| Hattie | 36 | +| Nanette | 28 | +| Dale | 33 | ++-----------+-----+ +``` + +## Example 2: Get first N results + +This example shows getting the first 3 results from accounts index. 
+
+```ppl
+source=accounts
+| fields firstname, age
+| head 3
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++-----------+-----+
+| firstname | age |
+|-----------+-----|
+| Amber     | 32  |
+| Hattie    | 36  |
+| Nanette   | 28  |
++-----------+-----+
```

+
+## Example 3: Get first N results after offset M
+
+This example shows getting the first 3 results after offset 1 from the accounts index.
+
+```ppl
+source=accounts
+| fields firstname, age
+| head 3 from 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++-----------+-----+
+| firstname | age |
+|-----------+-----|
+| Hattie    | 36  |
+| Nanette   | 28  |
+| Dale      | 33  |
++-----------+-----+
+```
+
+## Limitations
+
+The `head` command is not rewritten to OpenSearch DSL; it is only executed on the coordination node.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/head.rst b/docs/user/ppl/cmd/head.rst
deleted file mode 100644
index a17f283026d..00000000000
--- a/docs/user/ppl/cmd/head.rst
+++ /dev/null
@@ -1,77 +0,0 @@
-====
-head
-====
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-The ``head`` command returns the first N number of specified results after an optional offset in search order.
-
-Syntax
-======
-head [<size>] [from <offset>]
-
-* size: optional integer. Number of results to return. **Default:** 10
-* offset: optional integer after ``from``. Number of results to skip. **Default:** 0
-
-Example 1: Get first 10 results
-===============================
-
-This example shows getting a maximum of 10 results from accounts index.
-
-PPL query::
-
-    os> source=accounts | fields firstname, age | head;
-    fetched rows / total rows = 4/4
-    +-----------+-----+
-    | firstname | age |
-    |-----------+-----|
-    | Amber     | 32  |
-    | Hattie    | 36  |
-    | Nanette   | 28  |
-    | Dale      | 33  |
-    +-----------+-----+
-
-Example 2: Get first N results
-==============================
-
-This example shows getting the first 3 results from accounts index.
-
-PPL query::
-
-    os> source=accounts | fields firstname, age | head 3;
-    fetched rows / total rows = 3/3
-    +-----------+-----+
-    | firstname | age |
-    |-----------+-----|
-    | Amber     | 32  |
-    | Hattie    | 36  |
-    | Nanette   | 28  |
-    +-----------+-----+
-
-Example 3: Get first N results after offset M
-=============================================
-
-This example shows getting the first 3 results after offset 1 from accounts index.
-
-PPL query::
-
-    os> source=accounts | fields firstname, age | head 3 from 1;
-    fetched rows / total rows = 3/3
-    +-----------+-----+
-    | firstname | age |
-    |-----------+-----|
-    | Hattie    | 36  |
-    | Nanette   | 28  |
-    | Dale      | 33  |
-    +-----------+-----+
-
-Limitations
-===========
-The ``head`` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node.
diff --git a/docs/user/ppl/cmd/join.md b/docs/user/ppl/cmd/join.md
new file mode 100644
index 00000000000..39d3f5a24d2
--- /dev/null
+++ b/docs/user/ppl/cmd/join.md
@@ -0,0 +1,214 @@
+# join
+
+## Description
+
+The `join` command combines two datasets together. The left side can be an index or the results of piped commands; the right side can be either an index or a subsearch.
+## Syntax
+
+### Basic syntax:
+
+[joinType] join [leftAlias] [rightAlias] (on \| where) \<joinCriteria\> \<right-dataset\>
+* joinType: optional. The type of join to perform. Options: `left`, `semi`, `anti`, and performance-sensitive types `right`, `full`, `cross`. **Default:** `inner`.
+* leftAlias: optional. The subsearch alias to use with the left join side, to avoid ambiguous naming. Pattern: `left = <alias>`
+* rightAlias: optional. The subsearch alias to use with the right join side, to avoid ambiguous naming. Pattern: `right = <alias>`
+* joinCriteria: mandatory. Any comparison expression. Must follow the `on` or `where` keyword.
+* right-dataset: mandatory. The right dataset can be either an `index` or a `subsearch`, with or without an alias.
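+
+For example, the aliases below keep the two same-named join keys distinct in the output; the `orders` and `customers` index names are illustrative only and not part of the bundled test data:
+
+```
+source = orders | inner join left = o right = c on o.customer_id = c.customer_id customers | fields o.customer_id, c.customer_id
+```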
+
+### Extended syntax:
+
+join [type=<joinType>] [overwrite=<bool>] [max=n] (\<join-field-list\> \| [leftAlias] [rightAlias] (on \| where) \<joinCriteria\>) \<right-dataset\>
+* type: optional. Join type using extended syntax. Options: `left`, `outer` (alias of `left`), `semi`, `anti`, and performance-sensitive types `right`, `full`, `cross`. **Default:** `inner`.
+* overwrite: optional boolean. Only works with `join-field-list`. Specifies whether duplicate-named fields from the right-dataset should replace the corresponding fields in the main search results. **Default:** `true`.
+* max: optional integer. Controls how many subsearch results can be joined against each row in the main search. **Default:** 0 (unlimited).
+* join-field-list: optional. The fields used to build the join criteria. The join field list must exist on both sides. If not specified, all fields common to both sides will be used as join keys.
+* leftAlias: optional. Same as basic syntax when used with extended syntax.
+* rightAlias: optional. Same as basic syntax when used with extended syntax.
+* joinCriteria: mandatory. Same as basic syntax when used with extended syntax.
+* right-dataset: mandatory. Same as basic syntax.
+
+## Configuration
+
+### plugins.ppl.join.subsearch_maxout
+
+This setting configures the maximum number of rows from the subsearch to join against. The default value is `50000`. A value of `0` indicates that the restriction is unlimited.
+
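+For example, with the default limit of `50000`, the bracketed subsearch below is capped at 50000 rows before the join is applied; the `requests` and `users` index names are illustrative only:
+
+```
+source = requests | left join on requests.user_id = u.user_id [ source = users | fields user_id, plan ] as u
+```
+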
+Change the join.subsearch_maxout to 5000 + +```bash ignore +curl -sS -H 'Content-Type: application/json' \ +-X PUT localhost:9200/_plugins/_query/settings \ +-d '{"persistent" : {"plugins.ppl.join.subsearch_maxout" : "5000"}}' +``` + +```json +{ + "acknowledged": true, + "persistent": { + "plugins": { + "ppl": { + "join": { + "subsearch_maxout": "5000" + } + } + } + }, + "transient": {} +} +``` + +## Usage + +Basic join syntax: + +``` +source = table1 | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | inner join left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | left join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | right join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | full left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | cross join left = l right = r on 1=1 table2 +source = table1 | left semi join left = l right = r on l.a = r.a table2 +source = table1 | left anti join left = l right = r on l.a = r.a table2 +source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] +source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c +source = table1 | inner join on a = c table2 | fields a, b, c, d +source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a +source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a +source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a +``` + +Extended syntax with options: + +``` +source = table1 | join type=outer left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | join type=left left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | join type=inner max=1 left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c +source = table1 | join a table2 | fields a, b, c +source = table1 | join a, b table2 | fields a, b, c +source = table1 | join type=outer a b table2 | fields a, b, c +source = table1 | join type=inner max=1 a, b table2 | fields a, b, c +source = table1 | join type=left overwrite=false max=0 a, b [source=table2 | rename d as b] | fields a, b, c +``` + +## Example 1: Two indices join + +This example shows joining two indices using the basic join syntax. + +```ppl +source = state_country +| inner join left=a right=b ON a.name = b.name occupation +| stats avg(salary) by span(age, 10) as age_span, b.country +``` + +Expected output: + +```text +fetched rows / total rows = 5/5 ++-------------+----------+-----------+ +| avg(salary) | age_span | b.country | +|-------------+----------+-----------| +| 120000.0 | 40 | USA | +| 105000.0 | 20 | Canada | +| 0.0 | 40 | Canada | +| 70000.0 | 30 | USA | +| 100000.0 | 70 | England | ++-------------+----------+-----------+ +``` + +## Example 2: Join with subsearch + +This example shows joining with a subsearch using the basic join syntax. 
+
+```ppl
+source = state_country as a
+| where country = 'USA' OR country = 'England'
+| left join ON a.name = b.name [ source = occupation
+| where salary > 0
+| fields name, country, salary
+| sort salary
+| head 3 ] as b
+| stats avg(salary) by span(age, 10) as age_span, b.country
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++-------------+----------+-----------+
+| avg(salary) | age_span | b.country |
+|-------------+----------+-----------|
+| null        | 40       | null      |
+| 70000.0     | 30       | USA       |
+| 100000.0    | 70       | England   |
++-------------+----------+-----------+
```

+
+## Example 3: Join with field list
+
+This example shows joining using the extended syntax with a field list.
+
+```ppl
+source = state_country
+| where country = 'USA' OR country = 'England'
+| join type=left overwrite=true name [ source = occupation
+| where salary > 0
+| fields name, country, salary
+| sort salary
+| head 3 ]
+| stats avg(salary) by span(age, 10) as age_span, country
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++-------------+----------+---------+
+| avg(salary) | age_span | country |
+|-------------+----------+---------|
+| null        | 40       | null    |
+| 70000.0     | 30       | USA     |
+| 100000.0    | 70       | England |
++-------------+----------+---------+
+```
+
+## Example 4: Join with options
+
+This example shows joining using the extended syntax with additional options.
+
+```ppl
+source = state_country
+| join type=inner overwrite=false max=1 name occupation
+| stats avg(salary) by span(age, 10) as age_span, country
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-------------+----------+---------+
+| avg(salary) | age_span | country |
+|-------------+----------+---------|
+| 120000.0    | 40       | USA     |
+| 100000.0    | 70       | USA     |
+| 105000.0    | 20       | Canada  |
+| 70000.0     | 30       | USA     |
++-------------+----------+---------+
+```
+
+## Limitations
+
+For the basic syntax, fields in the left and right outputs may have the same name. Typically, with the join criteria
+`ON t1.id = t2.id`, the name `id` is ambiguous in the output. To avoid this ambiguity, the ambiguous
+fields are renamed in the output to `<alias>.id`, or to `<tableName>.id` if no alias exists.
+
+Assume table1 and table2 contain only the field `id`; the following PPL queries produce these outputs:
+
+| Query | Output |
+| --- | --- |
+| source=table1 \| join left=t1 right=t2 on t1.id=t2.id table2 \| eval a = 1 | t1.id, t2.id, a |
+| source=table1 \| join on table1.id=table2.id table2 \| eval a = 1 | table1.id, table2.id, a |
+| source=table1 \| join on table1.id=t2.id table2 as t2 \| eval a = 1 | table1.id, t2.id, a |
+| source=table1 \| join right=tt on table1.id=t2.id [ source=table2 as t2 \| eval b = id ] \| eval a = 1 | table1.id, tt.id, tt.b, a |
+
+For the extended syntax (join with a field list), duplicate-named fields in the output results are deduplicated, and the surviving fields are determined by the value of the `overwrite` option.
+Join types `inner`, `left`, `outer` (alias of `left`), `semi` and `anti` are supported by default. `right`, `full` and `cross` are performance-sensitive join types that are disabled by default. Set the config `plugins.calcite.all_join_types.allowed = true` to enable them.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/join.rst b/docs/user/ppl/cmd/join.rst
deleted file mode 100644
index 61dfc31042d..00000000000
--- a/docs/user/ppl/cmd/join.rst
+++ /dev/null
@@ -1,198 +0,0 @@
-====
-join
-====
-
-.. rubric:: Table of contents
-
-.. 
contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``join`` command combines two datasets together. The left side could be an index or results from a piped commands, the right side could be either an index or a subsearch. - -Syntax -====== - -Basic syntax: -------------- - -[joinType] join [leftAlias] [rightAlias] (on | where) - -* joinType: optional. The type of join to perform. Options: ``left``, ``semi``, ``anti``, and performance sensitive types ``right``, ``full``, ``cross``. **Default:** ``inner``. -* leftAlias: optional. The subsearch alias to use with the left join side, to avoid ambiguous naming. Pattern: ``left = `` -* rightAlias: optional. The subsearch alias to use with the right join side, to avoid ambiguous naming. Pattern: ``right = `` -* joinCriteria: mandatory. Any comparison expression. Must follow ``on`` or ``where`` keyword. -* right-dataset: mandatory. Right dataset could be either an ``index`` or a ``subsearch`` with/without alias. - -Extended syntax: ----------------- - -join [type=] [overwrite=] [max=n] ( | [leftAlias] [rightAlias] (on | where) ) - -* type: optional. Join type using extended syntax. Options: ``left``, ``outer`` (alias of ``left``), ``semi``, ``anti``, and performance sensitive types ``right``, ``full``, ``cross``. **Default:** ``inner``. -* overwrite: optional boolean. Only works with ``join-field-list``. Specifies whether duplicate-named fields from right-dataset should replace corresponding fields in the main search results. **Default:** ``true``. -* max: optional integer. Controls how many subsearch results could be joined against each row in main search. **Default:** 0 (unlimited). -* join-field-list: optional. The fields used to build the join criteria. The join field list must exist on both sides. If not specified, all fields common to both sides will be used as join keys. -* leftAlias: optional. Same as basic syntax when used with extended syntax. -* rightAlias: optional. Same as basic syntax when used with extended syntax. -* joinCriteria: mandatory. Same as basic syntax when used with extended syntax. -* right-dataset: mandatory. Same as basic syntax. - -Configuration -============= - -plugins.ppl.join.subsearch_maxout ---------------------------------- - -The size configures the maximum of rows from subsearch to join against. The default value is: ``50000``. A value of ``0`` indicates that the restriction is unlimited. - -Change the join.subsearch_maxout to 5000:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... 
-d '{"persistent" : {"plugins.ppl.join.subsearch_maxout" : "5000"}}' - { - "acknowledged": true, - "persistent": { - "plugins": { - "ppl": { - "join": { - "subsearch_maxout": "5000" - } - } - } - }, - "transient": {} - } - - -Usage -===== - -Basic join syntax:: - - source = table1 | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | inner join left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | left join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | right join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | full left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | cross join left = l right = r on 1=1 table2 - source = table1 | left semi join left = l right = r on l.a = r.a table2 - source = table1 | left anti join left = l right = r on l.a = r.a table2 - source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] - source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c - source = table1 | inner join on a = c table2 | fields a, b, c, d - source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a - source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a - source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a - -Extended syntax with options:: - - source = table1 | join type=outer left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | join type=left left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | join type=inner max=1 left = l right = r where l.a = r.a table2 | fields l.a, r.a, b, c - source = table1 | join a table2 | fields a, b, c - source = table1 | join a, b table2 | fields a, b, c - source = table1 | join type=outer a b table2 | fields a, b, c - source = table1 | join type=inner max=1 a, b table2 | fields a, b, c - source = table1 | join type=left overwrite=false max=0 a, b [source=table2 | rename d as b] | fields a, b, c - -Example 1: Two indices join -=========================== - -This example shows joining two indices using the basic join syntax. - -PPL query:: - - os> source = state_country | inner join left=a right=b ON a.name = b.name occupation | stats avg(salary) by span(age, 10) as age_span, b.country; - fetched rows / total rows = 5/5 - +-------------+----------+-----------+ - | avg(salary) | age_span | b.country | - |-------------+----------+-----------| - | 120000.0 | 40 | USA | - | 105000.0 | 20 | Canada | - | 0.0 | 40 | Canada | - | 70000.0 | 30 | USA | - | 100000.0 | 70 | England | - +-------------+----------+-----------+ - -Example 2: Join with subsearch -============================== - -This example shows joining with a subsearch using the basic join syntax. 
- -PPL query:: - - PPL> source = state_country as a | where country = 'USA' OR country = 'England' | left join ON a.name = b.name [ source = occupation | where salary > 0 | fields name, country, salary | sort salary | head 3 ] as b | stats avg(salary) by span(age, 10) as age_span, b.country; - fetched rows / total rows = 3/3 - +-------------+----------+-----------+ - | avg(salary) | age_span | b.country | - |-------------+----------+-----------| - | null | 40 | null | - | 70000.0 | 30 | USA | - | 100000.0 | 70 | England | - +-------------+----------+-----------+ - -Example 3: Join with field list -=============================== - -This example shows joining using the extended syntax with field list. - -PPL query:: - - PPL> source = state_country | where country = 'USA' OR country = 'England' | join type=left overwrite=true name [ source = occupation | where salary > 0 | fields name, country, salary | sort salary | head 3 ] | stats avg(salary) by span(age, 10) as age_span, country; - fetched rows / total rows = 3/3 - +-------------+----------+---------+ - | avg(salary) | age_span | country | - |-------------+----------+---------| - | null | 40 | null | - | 70000.0 | 30 | USA | - | 100000.0 | 70 | England | - +-------------+----------+---------+ - -Example 4: Join with options -============================ - -This example shows joining using the extended syntax with additional options. - -PPL query:: - - os> source = state_country | join type=inner overwrite=false max=1 name occupation | stats avg(salary) by span(age, 10) as age_span, country; - fetched rows / total rows = 4/4 - +-------------+----------+---------+ - | avg(salary) | age_span | country | - |-------------+----------+---------| - | 120000.0 | 40 | USA | - | 100000.0 | 70 | USA | - | 105000.0 | 20 | Canada | - | 70000.0 | 30 | USA | - +-------------+----------+---------+ - -Limitations -=========== -For basic syntax, if fields in the left outputs and right outputs have the same name. Typically, in the join criteria -``ON t1.id = t2.id``, the names ``id`` in output are ambiguous. To avoid ambiguous, the ambiguous -fields in output rename to ``.id``, or else ``.id`` if no alias existing. - -Assume table1 and table2 only contain field ``id``, following PPL queries and their outputs are: - -.. list-table:: - :widths: 75 25 - :header-rows: 1 - - * - Query - - Output - * - source=table1 | join left=t1 right=t2 on t1.id=t2.id table2 | eval a = 1 - - t1.id, t2.id, a - * - source=table1 | join on table1.id=table2.id table2 | eval a = 1 - - table1.id, table2.id, a - * - source=table1 | join on table1.id=t2.id table2 as t2 | eval a = 1 - - table1.id, t2.id, a - * - source=table1 | join right=tt on table1.id=t2.id [ source=table2 as t2 | eval b = id ] | eval a = 1 - - table1.id, tt.id, tt.b, a - -| For extended syntax (join with field list), when duplicate-named fields in output results are deduplicated, the fields in output determined by the value of 'overwrite' option. -| Join types ``inner``, ``left``, ``outer`` (alias of ``left``), ``semi`` and ``anti`` are supported by default. ``right``, ``full``, ``cross`` are performance sensitive join types which are disabled by default. Set config ``plugins.calcite.all_join_types.allowed = true`` to enable. 
diff --git a/docs/user/ppl/cmd/kmeans.md b/docs/user/ppl/cmd/kmeans.md
new file mode 100644
index 00000000000..247902804df
--- /dev/null
+++ b/docs/user/ppl/cmd/kmeans.md
@@ -0,0 +1,37 @@
+# kmeans (deprecated by ml command)
+
+## Description
+
+The `kmeans` command applies the kmeans algorithm from the ml-commons plugin to the search results returned by a PPL command.
+## Syntax
+
+kmeans \<centroids\> \<iterations\> \<distance_type\>
+* centroids: optional. The number of clusters you want to group your data points into. **Default:** 2.
+* iterations: optional. Number of iterations. **Default:** 10.
+* distance_type: optional. The distance type can be COSINE, L1, or EUCLIDEAN. **Default:** EUCLIDEAN.
+
+## Example: Clustering of Iris Dataset
+
+This example shows how to classify three Iris species (Iris setosa, Iris virginica and Iris versicolor) based on the combination of four features measured from each sample: the length and the width of the sepals and petals.
+
+```ppl
+source=iris_data
+| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm
+| kmeans centroids=3
+```
+
+Expected output:
+
+```text
++--------------------+-------------------+--------------------+-------------------+-----------+
+| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID |
+|--------------------+-------------------+--------------------+-------------------+-----------|
+| 5.1                | 3.5               | 1.4                | 0.2               | 1         |
+| 5.6                | 3.0               | 4.1                | 1.3               | 0         |
+| 6.7                | 2.5               | 5.8                | 1.8               | 2         |
++--------------------+-------------------+--------------------+-------------------+-----------+
+```
+
+## Limitations
+
+The `kmeans` command only works with `plugins.calcite.enabled=false`.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/kmeans.rst b/docs/user/ppl/cmd/kmeans.rst
deleted file mode 100644
index ca4ba255c7e..00000000000
--- a/docs/user/ppl/cmd/kmeans.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-=================================
-kmeans (deprecated by ml command)
-=================================
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``kmeans`` command applies the kmeans algorithm in the ml-commons plugin on the search result returned by a PPL command.
-
-Syntax
-======
-kmeans <centroids> <iterations> <distance_type>
-
-* centroids: optional. The number of clusters you want to group your data points into. **Default:** 2.
-* iterations: optional. Number of iterations. **Default:** 10.
-* distance_type: optional. The distance type can be COSINE, L1, or EUCLIDEAN. **Default:** EUCLIDEAN.
-
-
-Example: Clustering of Iris Dataset
-===================================
-
-This example shows how to classify three Iris species (Iris setosa, Iris virginica and Iris versicolor) based on the combination of four features measured from each sample: the length and the width of the sepals and petals.
-
-PPL query::
-
-    > source=iris_data | fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm | kmeans centroids=3
-    +--------------------+-------------------+--------------------+-------------------+-----------+
-    | sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID |
-    |--------------------+-------------------+--------------------+-------------------+-----------|
-    | 5.1                | 3.5               | 1.4                | 0.2               | 1         |
-    | 5.6                | 3.0               | 4.1                | 1.3               | 0         |
-    | 6.7                | 2.5               | 5.8                | 1.8               | 2         |
-    +--------------------+-------------------+--------------------+-------------------+-----------+
-
-
-Limitations
-===========
-The ``kmeans`` command can only work with ``plugins.calcite.enabled=false``.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/lookup.md b/docs/user/ppl/cmd/lookup.md
new file mode 100644
index 00000000000..03683cdc47b
--- /dev/null
+++ b/docs/user/ppl/cmd/lookup.md
@@ -0,0 +1,339 @@
+# lookup
+
+## Description
+
+The `lookup` command enriches your search data by adding or replacing data from a lookup index (dimension table). You can extend the fields of an index with values from a dimension table, and append or replace values when the lookup condition is matched. As an alternative to the join command, the lookup command is more suitable for enriching source data with a static dataset.
+## Syntax
+
+lookup \<lookupIndex\> (\<lookupMappingField\> [as \<sourceMappingField\>])... [(replace \| append) (\<inputField\> [as \<outputField\>])...]
+* lookupIndex: mandatory. The name of the lookup index (dimension table).
+* lookupMappingField: mandatory. A mapping key in `lookupIndex`, analogous to a join key from the right table. You can specify multiple `lookupMappingField` values, comma-delimited.
+* sourceMappingField: optional. A mapping key from the source (left side), analogous to a join key from the left side. If not specified, defaults to `lookupMappingField`.
+* inputField: optional. A field in `lookupIndex` whose matched values are applied to the result output. You can specify multiple `inputField` values, comma-delimited. If not specified, all fields except `lookupMappingField` from `lookupIndex` are applied to the result output.
+* outputField: optional. A field of the output. You can specify zero or multiple `outputField` values. If `outputField` has an existing field name in the source query, its values will be replaced or appended by matched values from `inputField`. If the field specified in `outputField` is a new field, then in the replace strategy an extended new field is applied to the results, but this fails in the append strategy.
+* replace \| append: optional. The output strategies. With replace, matched values in the `lookupIndex` field overwrite the values in the result. With append, matched values in the `lookupIndex` field are only applied to the missing values in the result. **Default:** replace.
+
+## Usage
+
+Lookup
+
+```
+source = table1 | lookup table2 id
+source = table1 | lookup table2 id, name
+source = table1 | lookup table2 id as cid, name
+source = table1 | lookup table2 id as cid, name replace dept as department
+source = table1 | lookup table2 id as cid, name replace dept as department, city as location
+source = table1 | lookup table2 id as cid, name append dept as department
+source = table1 | lookup table2 id as cid, name append dept as department, city as location
+```
+
+## Example 1: Replace strategy
+
+This example shows using the lookup command with the REPLACE strategy to overwrite existing values.
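+
+The equivalent standalone PPL statement sent in the query body below is:
+
+```
+source = worker | LOOKUP work_information uid AS id REPLACE department | fields id, name, occupation, country, salary, department
+```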
+ +```bash ignore +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information uid AS id REPLACE department + | fields id, name, occupation, country, salary, department + """ +}' +``` + +Result set + +```json +{ + "schema": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "occupation", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "salary", + "type": "integer" + }, + { + "name": "department", + "type": "string" + } + ], + "datarows": [ + [ + 1000, + "Jake", + "Engineer", + "England", + 100000, + "IT" + ], + [ + 1001, + "Hello", + "Artist", + "USA", + 70000, + null + ], + [ + 1002, + "John", + "Doctor", + "Canada", + 120000, + "DATA" + ], + [ + 1003, + "David", + "Doctor", + null, + 120000, + "HR" + ], + [ + 1004, + "David", + null, + "Canada", + 0, + null + ], + [ + 1005, + "Jane", + "Scientist", + "Canada", + 90000, + "DATA" + ] + ], + "total": 6, + "size": 6 +} +``` + +## Example 2: Append strategy + +This example shows using the lookup command with the APPEND strategy to fill missing values only. + +```bash ignore +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information uid AS id APPEND department + | fields id, name, occupation, country, salary, department + """ +}' +``` + +## Example 3: No inputField specified + +This example shows using the lookup command without specifying inputField, which applies all fields from the lookup index. + +```bash ignore +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information uid AS id, name + | fields id, name, occupation, country, salary, department + """ +}' +``` + +Result set + +```json +{ + "schema": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "salary", + "type": "integer" + }, + { + "name": "department", + "type": "string" + }, + { + "name": "occupation", + "type": "string" + } + ], + "datarows": [ + [ + 1000, + "Jake", + "England", + 100000, + "IT", + "Engineer" + ], + [ + 1001, + "Hello", + "USA", + 70000, + null, + null + ], + [ + 1002, + "John", + "Canada", + 120000, + "DATA", + "Scientist" + ], + [ + 1003, + "David", + null, + 120000, + "HR", + "Doctor" + ], + [ + 1004, + "David", + "Canada", + 0, + null, + null + ], + [ + 1005, + "Jane", + "Canada", + 90000, + "DATA", + "Engineer" + ] + ], + "total": 6, + "size": 6 +} +``` + +## Example 4: OutputField as a new field + +This example shows using the lookup command with outputField as a new field name. 
+ +```bash ignore +curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ + "query" : """ + source = worker + | LOOKUP work_information name REPLACE occupation AS new_col + | fields id, name, occupation, country, salary, new_col + """ +}' +``` + +Result set + +```json +{ + "schema": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "occupation", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "salary", + "type": "integer" + }, + { + "name": "new_col", + "type": "string" + } + ], + "datarows": [ + [ + 1003, + "David", + "Doctor", + null, + 120000, + "Doctor" + ], + [ + 1004, + "David", + null, + "Canada", + 0, + "Doctor" + ], + [ + 1001, + "Hello", + "Artist", + "USA", + 70000, + null + ], + [ + 1000, + "Jake", + "Engineer", + "England", + 100000, + "Engineer" + ], + [ + 1005, + "Jane", + "Scientist", + "Canada", + 90000, + "Engineer" + ], + [ + 1002, + "John", + "Doctor", + "Canada", + 120000, + "Scientist" + ] + ], + "total": 6, + "size": 6 +} +``` \ No newline at end of file diff --git a/docs/user/ppl/cmd/lookup.rst b/docs/user/ppl/cmd/lookup.rst deleted file mode 100644 index 4d4cf84a48b..00000000000 --- a/docs/user/ppl/cmd/lookup.rst +++ /dev/null @@ -1,350 +0,0 @@ -====== -lookup -====== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``lookup`` command enriches your search data by adding or replacing data from a lookup index (dimension table). You can extend fields of an index with values from a dimension table, append or replace values when lookup condition is matched. As an alternative of join command, lookup command is more suitable for enriching the source data with a static dataset. - -Syntax -====== -lookup ( [as ])... [(replace | append) ( [as ])...] - -* lookupIndex: mandatory. The name of lookup index (dimension table). -* lookupMappingField: mandatory. A mapping key in ``lookupIndex``, analogy to a join key from right table. You can specify multiple ``lookupMappingField`` with comma-delimited. -* sourceMappingField: optional. A mapping key from source (left side), analogy to a join key from left side. If not specified, defaults to ``lookupMappingField``. -* inputField: optional. A field in ``lookupIndex`` where matched values are applied to result output. You can specify multiple ``inputField`` with comma-delimited. If not specified, all fields except ``lookupMappingField`` from ``lookupIndex`` are applied to result output. -* outputField: optional. A field of output. You can specify zero or multiple ``outputField``. If ``outputField`` has an existing field name in source query, its values will be replaced or appended by matched values from ``inputField``. If the field specified in ``outputField`` is a new field, in replace strategy, an extended new field will be applied to the results, but fail in append strategy. -* replace | append: optional. The output strategies. If replace, matched values in ``lookupIndex`` field overwrite the values in result. If append, matched values in ``lookupIndex`` field only append to the missing values in result. **Default:** replace. 
- -Usage -===== - -Lookup:: - - source = table1 | lookup table2 id - source = table1 | lookup table2 id, name - source = table1 | lookup table2 id as cid, name - source = table1 | lookup table2 id as cid, name replace dept as department - source = table1 | lookup table2 id as cid, name replace dept as department, city as location - source = table1 | lookup table2 id as cid, name append dept as department - source = table1 | lookup table2 id as cid, name append dept as department, city as location - - -Example 1: Replace strategy -=========================== - -This example shows using the lookup command with the REPLACE strategy to overwrite existing values. - -PPL query:: - - >> curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information uid AS id REPLACE department - | fields id, name, occupation, country, salary, department - """ - }' - -Result set:: - - { - "schema": [ - { - "name": "id", - "type": "integer" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "occupation", - "type": "string" - }, - { - "name": "country", - "type": "string" - }, - { - "name": "salary", - "type": "integer" - }, - { - "name": "department", - "type": "string" - } - ], - "datarows": [ - [ - 1000, - "Jake", - "Engineer", - "England", - 100000, - "IT" - ], - [ - 1001, - "Hello", - "Artist", - "USA", - 70000, - null - ], - [ - 1002, - "John", - "Doctor", - "Canada", - 120000, - "DATA" - ], - [ - 1003, - "David", - "Doctor", - null, - 120000, - "HR" - ], - [ - 1004, - "David", - null, - "Canada", - 0, - null - ], - [ - 1005, - "Jane", - "Scientist", - "Canada", - 90000, - "DATA" - ] - ], - "total": 6, - "size": 6 - } - -Example 2: Append strategy -========================== - -This example shows using the lookup command with the APPEND strategy to fill missing values only. - -PPL query:: - - >> curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information uid AS id APPEND department - | fields id, name, occupation, country, salary, department - """ - }' - - -Example 3: No inputField specified -================================== - -This example shows using the lookup command without specifying inputField, which applies all fields from the lookup index. 
- -PPL query:: - - >> curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information uid AS id, name - | fields id, name, occupation, country, salary, department - """ - }' - -Result set:: - - { - "schema": [ - { - "name": "id", - "type": "integer" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "country", - "type": "string" - }, - { - "name": "salary", - "type": "integer" - }, - { - "name": "department", - "type": "string" - }, - { - "name": "occupation", - "type": "string" - } - ], - "datarows": [ - [ - 1000, - "Jake", - "England", - 100000, - "IT", - "Engineer" - ], - [ - 1001, - "Hello", - "USA", - 70000, - null, - null - ], - [ - 1002, - "John", - "Canada", - 120000, - "DATA", - "Scientist" - ], - [ - 1003, - "David", - null, - 120000, - "HR", - "Doctor" - ], - [ - 1004, - "David", - "Canada", - 0, - null, - null - ], - [ - 1005, - "Jane", - "Canada", - 90000, - "DATA", - "Engineer" - ] - ], - "total": 6, - "size": 6 - } - -Example 4: OutputField as a new field -===================================== - -This example shows using the lookup command with outputField as a new field name. - -PPL query:: - - >> curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = worker - | LOOKUP work_information name REPLACE occupation AS new_col - | fields id, name, occupation, country, salary, new_col - """ - }' - -Result set:: - - { - "schema": [ - { - "name": "id", - "type": "integer" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "occupation", - "type": "string" - }, - { - "name": "country", - "type": "string" - }, - { - "name": "salary", - "type": "integer" - }, - { - "name": "new_col", - "type": "string" - } - ], - "datarows": [ - [ - 1003, - "David", - "Doctor", - null, - 120000, - "Doctor" - ], - [ - 1004, - "David", - null, - "Canada", - 0, - "Doctor" - ], - [ - 1001, - "Hello", - "Artist", - "USA", - 70000, - null - ], - [ - 1000, - "Jake", - "Engineer", - "England", - 100000, - "Engineer" - ], - [ - 1005, - "Jane", - "Scientist", - "Canada", - 90000, - "Engineer" - ], - [ - 1002, - "John", - "Doctor", - "Canada", - 120000, - "Scientist" - ] - ], - "total": 6, - "size": 6 - } - diff --git a/docs/user/ppl/cmd/ml.md b/docs/user/ppl/cmd/ml.md new file mode 100644 index 00000000000..38098954bfb --- /dev/null +++ b/docs/user/ppl/cmd/ml.md @@ -0,0 +1,153 @@ +# ml + +## Description + +Use the `ml` command to train/predict/train and predict on any algorithm in the ml-commons plugin on the search result returned by a PPL command. +## Syntax + +## AD - Fixed In Time RCF For Time-series Data: + +ml action='train' algorithm='rcf' \ \ \ \ \ \ \ \ \ +* number_of_trees: optional integer. Number of trees in the forest. **Default:** 30. +* shingle_size: optional integer. A shingle is a consecutive sequence of the most recent records. **Default:** 8. +* sample_size: optional integer. The sample size used by stream samplers in this forest. **Default:** 256. +* output_after: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32. +* time_decay: optional double. The decay factor used by stream samplers in this forest. **Default:** 0.0001. +* anomaly_rate: optional double. The anomaly rate. **Default:** 0.005. +* time_field: mandatory string. It specifies the time field for RCF to use as time-series data. +* date_format: optional string. It's used for formatting time_field field. 
**Default:** "yyyy-MM-dd HH:mm:ss".
+* time_zone: optional string. It's used for setting the time zone for the time_field field. **Default:** UTC.
+* category_field: optional string. It specifies the category field used to group inputs. Each category will be independently predicted.
+
+## AD - Batch RCF for Non-time-series Data:
+
+ml action='train' algorithm='rcf' \<number_of_trees\> \<sample_size\> \<output_after\> \<training_data_size\> \<anomaly_score_threshold\>
+* number_of_trees: optional integer. Number of trees in the forest. **Default:** 30.
+* sample_size: optional integer. Number of random samples given to each tree from the training data set. **Default:** 256.
+* output_after: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32.
+* training_data_size: optional integer. **Default:** size of your training data set.
+* anomaly_score_threshold: optional double. The threshold of the anomaly score. **Default:** 1.0.
+* category_field: optional string. It specifies the category field used to group inputs. Each category will be independently predicted.
+
+## KMEANS:
+
+ml action='train' algorithm='kmeans' \<centroids\> \<iterations\> \<distance_type\>
+* centroids: optional integer. The number of clusters you want to group your data points into. **Default:** 2.
+* iterations: optional integer. Number of iterations. **Default:** 10.
+* distance_type: optional string. The distance type can be COSINE, L1, or EUCLIDEAN. **Default:** EUCLIDEAN.
+
+## Example 1: Detecting events in New York City from taxi ridership data with time-series data
+
+This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data.
+
+```ppl
+source=nyc_taxi
+| fields value, timestamp
+| ml action='train' algorithm='rcf' time_field='timestamp'
+| where value=10844.0
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+---------------------+-------+---------------+
+| value   | timestamp           | score | anomaly_grade |
+|---------+---------------------+-------+---------------|
+| 10844.0 | 2014-07-01 00:00:00 | 0.0   | 0.0           |
++---------+---------------------+-------+---------------+
+```
+
+## Example 2: Detecting events in New York City from taxi ridership data with time-series data independently with each category
+
+This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values.
+
+```ppl
+source=nyc_taxi
+| fields category, value, timestamp
+| ml action='train' algorithm='rcf' time_field='timestamp' category_field='category'
+| where value=10844.0 or value=6526.0
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------+---------+---------------------+-------+---------------+
+| category | value   | timestamp           | score | anomaly_grade |
+|----------+---------+---------------------+-------+---------------|
+| night    | 10844.0 | 2014-07-01 00:00:00 | 0.0   | 0.0           |
+| day      | 6526.0  | 2014-07-01 06:00:00 | 0.0   | 0.0           |
++----------+---------+---------------------+-------+---------------+
+```
+
+## Example 3: Detecting events in New York City from taxi ridership data with non-time-series data
+
+This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data.
+ +```ppl +source=nyc_taxi +| fields value +| ml action='train' algorithm='rcf' +| where value=10844.0 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+-------+-----------+ +| value | score | anomalous | +|---------+-------+-----------| +| 10844.0 | 0.0 | False | ++---------+-------+-----------+ +``` + +## Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently with each category + +This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values. + +```ppl +source=nyc_taxi +| fields category, value +| ml action='train' algorithm='rcf' category_field='category' +| where value=10844.0 or value=6526.0 +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------+---------+-------+-----------+ +| category | value | score | anomalous | +|----------+---------+-------+-----------| +| night | 10844.0 | 0.0 | False | +| day | 6526.0 | 0.0 | False | ++----------+---------+-------+-----------+ +``` + +## Example 5: KMEANS - Clustering of Iris Dataset + +This example shows how to use KMEANS to classify three Iris species (Iris setosa, Iris virginica and Iris versicolor) based on the combination of four features measured from each sample: the length and the width of the sepals and petals. + +```ppl +source=iris_data +| fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm +| ml action='train' algorithm='kmeans' centroids=3 +``` + +Expected output: + +```text ++--------------------+-------------------+--------------------+-------------------+-----------+ +| sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID | +|--------------------+-------------------+--------------------+-------------------+-----------| +| 5.1 | 3.5 | 1.4 | 0.2 | 1 | +| 5.6 | 3.0 | 4.1 | 1.3 | 0 | +| 6.7 | 2.5 | 5.8 | 1.8 | 2 | ++--------------------+-------------------+--------------------+-------------------+-----------+ +``` + +## Limitations + +The `ml` command can only work with `plugins.calcite.enabled=false`. \ No newline at end of file diff --git a/docs/user/ppl/cmd/ml.rst b/docs/user/ppl/cmd/ml.rst deleted file mode 100644 index 371df4de880..00000000000 --- a/docs/user/ppl/cmd/ml.rst +++ /dev/null @@ -1,138 +0,0 @@ -== -ml -== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| Use the ``ml`` command to train/predict/train and predict on any algorithm in the ml-commons plugin on the search result returned by a PPL command. - -Syntax -====== - -AD - Fixed In Time RCF For Time-series Data: --------------------------------------------- - -ml action='train' algorithm='rcf' - -* number_of_trees: optional integer. Number of trees in the forest. **Default:** 30. -* shingle_size: optional integer. A shingle is a consecutive sequence of the most recent records. **Default:** 8. -* sample_size: optional integer. The sample size used by stream samplers in this forest. **Default:** 256. -* output_after: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32. -* time_decay: optional double. The decay factor used by stream samplers in this forest. **Default:** 0.0001. -* anomaly_rate: optional double. The anomaly rate. **Default:** 0.005. -* time_field: mandatory string. It specifies the time field for RCF to use as time-series data. -* date_format: optional string. 
It's used for formatting time_field field. **Default:** "yyyy-MM-dd HH:mm:ss". -* time_zone: optional string. It's used for setting time zone for time_field field. **Default:** UTC. -* category_field: optional string. It specifies the category field used to group inputs. Each category will be independently predicted. - -AD - Batch RCF for Non-time-series Data: ----------------------------------------- - -ml action='train' algorithm='rcf' - -* number_of_trees: optional integer. Number of trees in the forest. **Default:** 30. -* sample_size: optional integer. Number of random samples given to each tree from the training data set. **Default:** 256. -* output_after: optional integer. The number of points required by stream samplers before results are returned. **Default:** 32. -* training_data_size: optional integer. **Default:** size of your training data set. -* anomaly_score_threshold: optional double. The threshold of anomaly score. **Default:** 1.0. -* category_field: optional string. It specifies the category field used to group inputs. Each category will be independently predicted. - -KMEANS: -------- - -ml action='train' algorithm='kmeans' - -* centroids: optional integer. The number of clusters you want to group your data points into. **Default:** 2. -* iterations: optional integer. Number of iterations. **Default:** 10. -* distance_type: optional string. The distance type can be COSINE, L1, or EUCLIDEAN. **Default:** EUCLIDEAN. - -Example 1: Detecting events in New York City from taxi ridership data with time-series data -=========================================================================================== - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data. - -PPL query:: - - os> source=nyc_taxi | fields value, timestamp | ml action='train' algorithm='rcf' time_field='timestamp' | where value=10844.0 - fetched rows / total rows = 1/1 - +---------+---------------------+-------+---------------+ - | value | timestamp | score | anomaly_grade | - |---------+---------------------+-------+---------------| - | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | - +---------+---------------------+-------+---------------+ - -Example 2: Detecting events in New York City from taxi ridership data with time-series data independently with each category -============================================================================================================================ - -This example trains an RCF model and uses the model to detect anomalies in the time-series ridership data with multiple category values. - -PPL query:: - - os> source=nyc_taxi | fields category, value, timestamp | ml action='train' algorithm='rcf' time_field='timestamp' category_field='category' | where value=10844.0 or value=6526.0 - fetched rows / total rows = 2/2 - +----------+---------+---------------------+-------+---------------+ - | category | value | timestamp | score | anomaly_grade | - |----------+---------+---------------------+-------+---------------| - | night | 10844.0 | 2014-07-01 00:00:00 | 0.0 | 0.0 | - | day | 6526.0 | 2014-07-01 06:00:00 | 0.0 | 0.0 | - +----------+---------+---------------------+-------+---------------+ - - -Example 3: Detecting events in New York City from taxi ridership data with non-time-series data -=============================================================================================== - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data. 
- -PPL query:: - - os> source=nyc_taxi | fields value | ml action='train' algorithm='rcf' | where value=10844.0 - fetched rows / total rows = 1/1 - +---------+-------+-----------+ - | value | score | anomalous | - |---------+-------+-----------| - | 10844.0 | 0.0 | False | - +---------+-------+-----------+ - -Example 4: Detecting events in New York City from taxi ridership data with non-time-series data independently with each category -================================================================================================================================ - -This example trains an RCF model and uses the model to detect anomalies in the non-time-series ridership data with multiple category values. - -PPL query:: - - os> source=nyc_taxi | fields category, value | ml action='train' algorithm='rcf' category_field='category' | where value=10844.0 or value=6526.0 - fetched rows / total rows = 2/2 - +----------+---------+-------+-----------+ - | category | value | score | anomalous | - |----------+---------+-------+-----------| - | night | 10844.0 | 0.0 | False | - | day | 6526.0 | 0.0 | False | - +----------+---------+-------+-----------+ - -Example 5: KMEANS - Clustering of Iris Dataset -=============================================== - -This example shows how to use KMEANS to classify three Iris species (Iris setosa, Iris virginica and Iris versicolor) based on the combination of four features measured from each sample: the length and the width of the sepals and petals. - -PPL query:: - - os> source=iris_data | fields sepal_length_in_cm, sepal_width_in_cm, petal_length_in_cm, petal_width_in_cm | ml action='train' algorithm='kmeans' centroids=3 - +--------------------+-------------------+--------------------+-------------------+-----------+ - | sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | ClusterID | - |--------------------+-------------------+--------------------+-------------------+-----------| - | 5.1 | 3.5 | 1.4 | 0.2 | 1 | - | 5.6 | 3.0 | 4.1 | 1.3 | 0 | - | 6.7 | 2.5 | 5.8 | 1.8 | 2 | - +--------------------+-------------------+--------------------+-------------------+-----------+ - - -Limitations -=========== -The ``ml`` command can only work with ``plugins.calcite.enabled=false``. diff --git a/docs/user/ppl/cmd/multisearch.md b/docs/user/ppl/cmd/multisearch.md new file mode 100644 index 00000000000..0b6e8ae208e --- /dev/null +++ b/docs/user/ppl/cmd/multisearch.md @@ -0,0 +1,152 @@ +# multisearch + +## Description + +Use the `multisearch` command to run multiple search subsearches and merge their results together. The command allows you to combine data from different queries on the same or different sources, and optionally apply subsequent processing to the combined result set. +Key aspects of `multisearch`: +1. Combines results from multiple search operations into a single result set. +2. Each subsearch can have different filtering criteria, data transformations, and field selections. +3. Results are merged and can be further processed with aggregations, sorting, and other PPL commands. +4. Particularly useful for comparative analysis, union operations, and creating comprehensive datasets from multiple search criteria. +5. Supports timestamp-based result interleaving when working with time-series data. + +Use Cases: +* **Comparative Analysis**: Compare metrics across different segments, regions, or time periods +* **Success Rate Monitoring**: Calculate success rates by comparing successful vs. 
total operations
+* **Multi-source Data Combination**: Merge data from different indices or apply different filters to the same source
+* **A/B Testing Analysis**: Combine results from different test groups for comparison
+* **Time-series Data Merging**: Interleave events from multiple sources based on timestamps
+
+## Syntax
+
+multisearch \<subsearch1\> \<subsearch2\> \<subsearch3\> ...
+* subsearch1, subsearch2, ...: mandatory. At least two subsearches are required. Each subsearch must be enclosed in square brackets and start with the `search` keyword. Format: `[search source=index | commands...]`. All PPL commands are supported within subsearches.
+* result-processing: optional. Commands applied to the merged results after the multisearch operation, such as `stats`, `sort`, `head`, etc.
+
+## Usage
+
+Basic multisearch
+
+```
+| multisearch [search source=table | where condition1] [search source=table | where condition2]
+| multisearch [search source=index1 | fields field1, field2] [search source=index2 | fields field1, field2]
+| multisearch [search source=table | where status="success"] [search source=table | where status="error"]
```

+
+## Example 1: Basic Age Group Analysis
+
+This example combines young and adult customers into a single result set for further analysis.
+
+```ppl
+| multisearch [search source=accounts
+| where age < 30
+| eval age_group = "young"
+| fields firstname, age, age_group] [search source=accounts
+| where age >= 30
+| eval age_group = "adult"
+| fields firstname, age, age_group]
+| sort age
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------+-----+-----------+
+| firstname | age | age_group |
+|-----------+-----+-----------|
+| Nanette   | 28  | young     |
+| Amber     | 32  | adult     |
+| Dale      | 33  | adult     |
+| Hattie    | 36  | adult     |
++-----------+-----+-----------+
+```
+
+## Example 2: Success Rate Pattern
+
+This example combines high-balance and all valid accounts for comparison analysis.
+
+```ppl
+| multisearch [search source=accounts
+| where balance > 20000
+| eval query_type = "high_balance"
+| fields firstname, balance, query_type] [search source=accounts
+| where balance > 0 AND balance <= 20000
+| eval query_type = "regular"
+| fields firstname, balance, query_type]
+| sort balance desc
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------+---------+--------------+
+| firstname | balance | query_type   |
+|-----------+---------+--------------|
+| Amber     | 39225   | high_balance |
+| Nanette   | 32838   | high_balance |
+| Hattie    | 5686    | regular      |
+| Dale      | 4180    | regular      |
++-----------+---------+--------------+
+```
+
+## Example 3: Timestamp Interleaving
+
+This example combines time-series data from multiple sources with automatic timestamp-based ordering.
+ +```ppl +| multisearch [search source=time_data +| where category IN ("A", "B")] [search source=time_data2 +| where category IN ("E", "F")] +| fields @timestamp, category, value, timestamp +| head 5 +``` + +Expected output: + +```text +fetched rows / total rows = 5/5 ++---------------------+----------+-------+---------------------+ +| @timestamp | category | value | timestamp | +|---------------------+----------+-------+---------------------| +| 2025-08-01 04:00:00 | E | 2001 | 2025-08-01 04:00:00 | +| 2025-08-01 03:47:41 | A | 8762 | 2025-08-01 03:47:41 | +| 2025-08-01 02:30:00 | F | 2002 | 2025-08-01 02:30:00 | +| 2025-08-01 01:14:11 | B | 9015 | 2025-08-01 01:14:11 | +| 2025-08-01 01:00:00 | E | 2003 | 2025-08-01 01:00:00 | ++---------------------+----------+-------+---------------------+ +``` + +## Example 4: Type Compatibility - Missing Fields + +This example demonstrates how missing fields are handled with NULL insertion. + +```ppl +| multisearch [search source=accounts +| where age < 30 +| eval young_flag = "yes" +| fields firstname, age, young_flag] [search source=accounts +| where age >= 30 +| fields firstname, age] +| sort age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+-----+------------+ +| firstname | age | young_flag | +|-----------+-----+------------| +| Nanette | 28 | yes | +| Amber | 32 | null | +| Dale | 33 | null | +| Hattie | 36 | null | ++-----------+-----+------------+ +``` + +## Limitations + +* **Minimum Subsearches**: At least two subsearches must be specified +* **Schema Compatibility**: When fields with the same name exist across subsearches but have incompatible types, the system automatically resolves conflicts by renaming the conflicting fields. The first occurrence retains the original name, while subsequent conflicting fields are renamed with a numeric suffix (e.g., `age` becomes `age0`, `age1`, etc.). This ensures all data is preserved while maintaining schema consistency. \ No newline at end of file diff --git a/docs/user/ppl/cmd/multisearch.rst b/docs/user/ppl/cmd/multisearch.rst deleted file mode 100644 index ed1e092c8af..00000000000 --- a/docs/user/ppl/cmd/multisearch.rst +++ /dev/null @@ -1,126 +0,0 @@ -=========== -multisearch -=========== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| Use the ``multisearch`` command to run multiple search subsearches and merge their results together. The command allows you to combine data from different queries on the same or different sources, and optionally apply subsequent processing to the combined result set. - -| Key aspects of ``multisearch``: - -1. Combines results from multiple search operations into a single result set. -2. Each subsearch can have different filtering criteria, data transformations, and field selections. -3. Results are merged and can be further processed with aggregations, sorting, and other PPL commands. -4. Particularly useful for comparative analysis, union operations, and creating comprehensive datasets from multiple search criteria. -5. Supports timestamp-based result interleaving when working with time-series data. - -| Use Cases: - -* **Comparative Analysis**: Compare metrics across different segments, regions, or time periods -* **Success Rate Monitoring**: Calculate success rates by comparing successful vs. 
total operations -* **Multi-source Data Combination**: Merge data from different indices or apply different filters to the same source -* **A/B Testing Analysis**: Combine results from different test groups for comparison -* **Time-series Data Merging**: Interleave events from multiple sources based on timestamps - -Syntax -====== -multisearch ... - -* subsearch1, subsearch2, ...: mandatory. At least two subsearches required. Each subsearch must be enclosed in square brackets and start with the ``search`` keyword. Format: ``[search source=index | commands...]``. All PPL commands are supported within subsearches. -* result-processing: optional. Commands applied to the merged results after the multisearch operation, such as ``stats``, ``sort``, ``head``, etc. - -Usage -===== - -Basic multisearch:: - - | multisearch [search source=table | where condition1] [search source=table | where condition2] - | multisearch [search source=index1 | fields field1, field2] [search source=index2 | fields field1, field2] - | multisearch [search source=table | where status="success"] [search source=table | where status="error"] - -Example 1: Basic Age Group Analysis -=================================== - -This example combines young and adult customers into a single result set for further analysis. - -PPL query:: - - os> | multisearch [search source=accounts | where age < 30 | eval age_group = "young" | fields firstname, age, age_group] [search source=accounts | where age >= 30 | eval age_group = "adult" | fields firstname, age, age_group] | sort age; - fetched rows / total rows = 4/4 - +-----------+-----+-----------+ - | firstname | age | age_group | - |-----------+-----+-----------| - | Nanette | 28 | young | - | Amber | 32 | adult | - | Dale | 33 | adult | - | Hattie | 36 | adult | - +-----------+-----+-----------+ - -Example 2: Success Rate Pattern -=============================== - -This example combines high-balance and all valid accounts for comparison analysis. - -PPL query:: - - os> | multisearch [search source=accounts | where balance > 20000 | eval query_type = "high_balance" | fields firstname, balance, query_type] [search source=accounts | where balance > 0 AND balance <= 20000 | eval query_type = "regular" | fields firstname, balance, query_type] | sort balance desc; - fetched rows / total rows = 4/4 - +-----------+---------+--------------+ - | firstname | balance | query_type | - |-----------+---------+--------------| - | Amber | 39225 | high_balance | - | Nanette | 32838 | high_balance | - | Hattie | 5686 | regular | - | Dale | 4180 | regular | - +-----------+---------+--------------+ - -Example 3: Timestamp Interleaving -================================= - -This example combines time-series data from multiple sources with automatic timestamp-based ordering. 
-
-PPL query::
-
-    os> | multisearch [search source=time_data | where category IN ("A", "B")] [search source=time_data2 | where category IN ("E", "F")] | fields @timestamp, category, value, timestamp | head 5;
-    fetched rows / total rows = 5/5
-    +---------------------+----------+-------+---------------------+
-    | @timestamp          | category | value | timestamp           |
-    |---------------------+----------+-------+---------------------|
-    | 2025-08-01 04:00:00 | E        | 2001  | 2025-08-01 04:00:00 |
-    | 2025-08-01 03:47:41 | A        | 8762  | 2025-08-01 03:47:41 |
-    | 2025-08-01 02:30:00 | F        | 2002  | 2025-08-01 02:30:00 |
-    | 2025-08-01 01:14:11 | B        | 9015  | 2025-08-01 01:14:11 |
-    | 2025-08-01 01:00:00 | E        | 2003  | 2025-08-01 01:00:00 |
-    +---------------------+----------+-------+---------------------+
-
-Example 4: Type Compatibility - Missing Fields
-=================================================
-
-This example demonstrates how missing fields are handled with NULL insertion.
-
-PPL query::
-
-    os> | multisearch [search source=accounts | where age < 30 | eval young_flag = "yes" | fields firstname, age, young_flag] [search source=accounts | where age >= 30 | fields firstname, age] | sort age;
-    fetched rows / total rows = 4/4
-    +-----------+-----+------------+
-    | firstname | age | young_flag |
-    |-----------+-----+------------|
-    | Nanette   | 28  | yes        |
-    | Amber     | 32  | null       |
-    | Dale      | 33  | null       |
-    | Hattie    | 36  | null       |
-    +-----------+-----+------------+
-
-
-Limitations
-===========
-
-* **Minimum Subsearches**: At least two subsearches must be specified
-* **Schema Compatibility**: When fields with the same name exist across subsearches but have incompatible types, the system automatically resolves conflicts by renaming the conflicting fields. The first occurrence retains the original name, while subsequent conflicting fields are renamed with a numeric suffix (e.g., ``age`` becomes ``age0``, ``age1``, etc.). This ensures all data is preserved while maintaining schema consistency.
diff --git a/docs/user/ppl/cmd/parse.md b/docs/user/ppl/cmd/parse.md
new file mode 100644
index 00000000000..8e151ad888b
--- /dev/null
+++ b/docs/user/ppl/cmd/parse.md
@@ -0,0 +1,133 @@
+# parse
+
+## Description
+
+The `parse` command parses a text field with a regular expression and appends the result to the search result.
+
+## Syntax
+
+parse \<field\> \<pattern\>
+* field: mandatory. The field must be a text field.
+* pattern: mandatory. The regular expression pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field.
+
+## Regular Expression
+
+The regular expression pattern is used to match the whole text field of each document with the Java regex engine. Each named capture group in the expression will become a new `STRING` field.
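+
+The following is a hedged sketch (not part of the original examples; output omitted) illustrating that every named capture group becomes its own `STRING` field: it splits `email` from the sample `accounts` index into hypothetical `user` and `host` fields.
+
+```ppl
+source=accounts
+| parse email '(?<user>.+)@(?<host>.+)'
+| fields email, user, host
+```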
+
+## Example 1: Create a new field
+
+This example shows how to create a new field `host` for each document. `host` will be the host name after `@` in the `email` field. Parsing a null field will return an empty string.
+
+```ppl
+source=accounts
+| parse email '.+@(?<host>.+)'
+| fields email, host
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------------------+------------+
+| email                 | host       |
+|-----------------------+------------|
+| amberduke@pyrami.com  | pyrami.com |
+| hattiebond@netagy.com | netagy.com |
+| null                  |            |
+| daleadams@boink.com   | boink.com  |
++-----------------------+------------+
+```
+
+## Example 2: Override an existing field
+
+This example shows how to override the existing `address` field with the street number removed.
+
+```ppl
+source=accounts
+| parse address '\d+ (?<street>
.+)'
+| fields address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++------------------+
+| address          |
+|------------------|
+| Holmes Lane      |
+| Bristol Street   |
+| Madison Street   |
+| Hutchinson Court |
++------------------+
+```
+
+## Example 3: Filter and sort by casted parsed field
+
+This example shows how to sort street numbers that are higher than 500 in the `address` field.
+
+```ppl
+source=accounts
+| parse address '(?<streetNumber>\d+) (?<street>.+)'
+| where cast(streetNumber as int) > 500
+| sort num(streetNumber)
+| fields streetNumber, street
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++--------------+----------------+
+| streetNumber | street         |
+|--------------+----------------|
+| 671          | Bristol Street |
+| 789          | Madison Street |
+| 880          | Holmes Lane    |
++--------------+----------------+
+```
+
+## Limitations
+
+There are a few limitations with the parse command:
+- Fields defined by parse cannot be parsed again.
+
+The following command will not work:
+
+```
+source=accounts | parse address '\d+ (?<street>.+)' | parse street '\w+ (?<road>\w+)' ;
+```
+
+- Fields defined by parse cannot be overridden with other commands.
+
+`where` will not match any documents since `street` cannot be overridden:
+
+```
+source=accounts | parse address '\d+ (?<street>.+)' | eval street='1' | where street='1' ;
+```
+
+- The text field used by parse cannot be overridden.
+
+`street` will not be successfully parsed since `address` is overridden:
+
+```
+source=accounts | parse address '\d+ (?<street>.+)' | eval address='1' ;
+```
+
+- Fields defined by parse cannot be filtered/sorted after using them in the `stats` command.
+
+`where` in the following command will not work:
+
+```
+source=accounts | parse email '.+@(?<host>.+)' | stats avg(age) by host | where host=pyrami.com ;
+```
+
+- Fields defined by parse will not appear in the final result unless the original source field is included in the `fields` command.
+
+For example, the following query will not display the parsed field `host` unless the source field `email` is also explicitly included:
+
+```
+source=accounts | parse email '.+@(?<host>.+)' | fields email, host ;
+```
+
+- Named capture group must start with a letter and contain only letters and digits.
+
+  For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html)
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/parse.rst b/docs/user/ppl/cmd/parse.rst
deleted file mode 100644
index 833736238b9..00000000000
--- a/docs/user/ppl/cmd/parse.rst
+++ /dev/null
@@ -1,119 +0,0 @@
-=====
-parse
-=====
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``parse`` command parses a text field with a regular expression and appends the result to the search result.
-
-
-Syntax
-======
-parse
-
-* field: mandatory. The field must be a text field.
-* pattern: mandatory. The regular expression pattern used to extract new fields from the given text field. If a new field name already exists, it will replace the original field.
-
-Regular Expression
-==================
-The regular expression pattern is used to match the whole text field of each document with Java regex engine. Each named capture group in the expression will become a new ``STRING`` field.
-
-Example 1: Create a new field
-=============================
-
-This example shows how to create a new field ``host`` for each document.
``host`` will be the host name after ``@`` in ``email`` field. Parsing a null field will return an empty string. - -PPL query:: - - os> source=accounts | parse email '.+@(?.+)' | fields email, host ; - fetched rows / total rows = 4/4 - +-----------------------+------------+ - | email | host | - |-----------------------+------------| - | amberduke@pyrami.com | pyrami.com | - | hattiebond@netagy.com | netagy.com | - | null | | - | daleadams@boink.com | boink.com | - +-----------------------+------------+ - - -Example 2: Override an existing field -===================================== - -This example shows how to override the existing ``address`` field with street number removed. - -PPL query:: - - os> source=accounts | parse address '\d+ (?
.+)' | fields address ;
-    fetched rows / total rows = 4/4
-    +------------------+
-    | address          |
-    |------------------|
-    | Holmes Lane      |
-    | Bristol Street   |
-    | Madison Street   |
-    | Hutchinson Court |
-    +------------------+
-
-Example 3: Filter and sort by casted parsed field
-=================================================
-
-This example shows how to sort street numbers that are higher than 500 in ``address`` field.
-
-PPL query::
-
-    os> source=accounts | parse address '(?\d+) (?.+)' | where cast(streetNumber as int) > 500 | sort num(streetNumber) | fields streetNumber, street ;
-    fetched rows / total rows = 3/3
-    +--------------+----------------+
-    | streetNumber | street         |
-    |--------------+----------------|
-    | 671          | Bristol Street |
-    | 789          | Madison Street |
-    | 880          | Holmes Lane    |
-    +--------------+----------------+
-
-Limitations
-===========
-
-There are a few limitations with parse command:
-
-- Fields defined by parse cannot be parsed again.
-
-  The following command will not work::
-
-    source=accounts | parse address '\d+ (?.+)' | parse street '\w+ (?\w+)' ;
-
-- Fields defined by parse cannot be overridden with other commands.
-
-  ``where`` will not match any documents since ``street`` cannot be overridden::
-
-    source=accounts | parse address '\d+ (?.+)' | eval street='1' | where street='1' ;
-
-- The text field used by parse cannot be overridden.
-
-  ``street`` will not be successfully parsed since ``address`` is overridden::
-
-    source=accounts | parse address '\d+ (?.+)' | eval address='1' ;
-
-- Fields defined by parse cannot be filtered/sorted after using them in ``stats`` command.
-
-  ``where`` in the following command will not work::
-
-    source=accounts | parse email '.+@(?.+)' | stats avg(age) by host | where host=pyrami.com ;
-
-- Fields defined by parse will not appear in the final result unless the original source field is included in the ``fields`` command.
-
-  For example, the following query will not display the parsed fields ``host`` unless the source field ``email`` is also explicitly included::
-
-    source=accounts | parse email '.+@(?.+)' | fields email, host ;
-
-- Named capture group must start with a letter and contain only letters and digits.
-
-  For detailed Java regex pattern syntax and usage, refer to the `official Java Pattern documentation `_
diff --git a/docs/user/ppl/cmd/patterns.md b/docs/user/ppl/cmd/patterns.md
new file mode 100644
index 00000000000..7b9cb718891
--- /dev/null
+++ b/docs/user/ppl/cmd/patterns.md
@@ -0,0 +1,260 @@
+# patterns
+
+## Description
+
+The `patterns` command extracts log patterns from a text field and appends the results to the search result. Grouping logs by their patterns makes it easier to aggregate stats from large volumes of log data for analysis and troubleshooting.
+
+The `patterns` command allows users to select different log parsing algorithms to achieve high log pattern grouping accuracy. Two pattern methods are supported: `simple_pattern` and `brain`.
+
+The `simple_pattern` algorithm is a regex-based parsing method, whereas the `brain` algorithm is an automatic log grouping algorithm that achieves high grouping accuracy while preserving semantic meaning.
+
+The `patterns` command supports two modes: `label` and `aggregation`. `label` mode returns individual pattern labels. `aggregation` mode returns aggregated results on the target field.
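+
+As a quick sketch of how the two modes are selected (these template queries are illustrative, not from the original example set; outputs omitted):
+
+```
+source=apache | patterns message mode=label
+source=apache | patterns message mode=aggregation
+```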
+The Calcite engine by default labels variables with the '\<*\>' placeholder. If the `show_numbered_token` option is turned on, the Calcite engine's `label` mode not only labels the pattern of the text but also labels the variable tokens in a map. In `aggregation` mode, it also outputs the labeled pattern as well as the variable tokens per pattern. The variable placeholder is in the format of '\<token1\>', '\<token2\>', etc. instead of '<\*>'.
+
+## Syntax
+
+patterns \<field\> [by byClause...] [method=simple_pattern \| brain] [mode=label \| aggregation] [max_sample_count=integer] [buffer_limit=integer] [show_numbered_token=boolean] [new_field=\<new-field-name\>] (algorithm parameters...)
+* field: mandatory. The text field to analyze for patterns.
+* byClause: optional. Fields or scalar functions used to group logs for labeling/aggregation.
+* method: optional. Algorithm choice: `simple_pattern` or `brain`. **Default:** `simple_pattern`.
+* mode: optional. Output mode: `label` or `aggregation`. **Default:** `label`.
+* max_sample_count: optional. Max sample logs returned per pattern in aggregation mode. **Default:** 10.
+* buffer_limit: optional. Safeguard parameter for the `brain` algorithm to limit internal temporary buffer size (min: 50,000). **Default:** 100,000.
+* show_numbered_token: optional. The flag to turn on the numbered token output format. **Default:** false.
+* new_field: optional. Alias of the output pattern field. **Default:** "patterns_field".
+* algorithm parameters: optional. Algorithm-specific tuning:
+  * `simple_pattern`: Define the regex via "pattern".
+  * `brain`: Adjust sensitivity with variable_count_threshold and frequency_threshold_percentage.
+    * `variable_count_threshold`: optional integer. Words are split by space. The algorithm counts how many distinct words occur at a specific position in the initial log groups. Adjusting this threshold determines the sensitivity for detecting constant words. **Default:** 5.
+    * `frequency_threshold_percentage`: optional double. Brain's log pattern is selected based on the longest word combination. This sets the lower bound of frequency below which low-frequency words are ignored. **Default:** 0.3.
+
+## Change the default pattern method
+
+To override the default pattern parameters, users can run the following command:
+
+```
+    PUT _cluster/settings
+    {
+        "persistent": {
+            "plugins.ppl.pattern.method": "brain",
+            "plugins.ppl.pattern.mode": "aggregation",
+            "plugins.ppl.pattern.max.sample.count": 5,
+            "plugins.ppl.pattern.buffer.limit": 50000,
+            "plugins.ppl.pattern.show.numbered.token": true
+        }
+    }
+```
+
+## Simple Pattern Example 1: Create the new field
+
+This example shows how to extract patterns in `email` for each document. Parsing a null field will return an empty string.
+
+```ppl
+source=accounts
+| patterns email method=simple_pattern
+| fields email, patterns_field
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------------------+----------------+
+| email                 | patterns_field |
+|-----------------------+----------------|
+| amberduke@pyrami.com  | <*>@<*>.<*>    |
+| hattiebond@netagy.com | <*>@<*>.<*>    |
+| null                  |                |
+| daleadams@boink.com   | <*>@<*>.<*>    |
++-----------------------+----------------+
+```
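+
+The optional `by` clause in the syntax above scopes pattern extraction to groups of rows. The following is a hedged sketch rather than one of the original examples (output omitted): it labels email patterns within each `gender` group of the sample `accounts` index.
+
+```ppl
+source=accounts
+| patterns email by gender method=simple_pattern
+| fields gender, email, patterns_field
+```
+
+## Simple Pattern Example 2: Extract log patterns
+
+This example shows how to extract patterns from a raw log field using the default patterns.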
+ +```ppl +source=apache +| patterns message method=simple_pattern +| fields message, patterns_field +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ +| message | patterns_field | +|-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------| +| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | +| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | +| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | +| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | ++-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ +``` + +## Simple Pattern Example 3: Extract log patterns with custom regex pattern + +This example shows how to extract patterns from a raw log field using user defined patterns. 
+ +```ppl +source=apache +| patterns message method=simple_pattern new_field='no_numbers' pattern='[0-9]' +| fields message, no_numbers +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| message | no_numbers | +|-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*><*><*>.<*><*>.<*>.<*><*> - upton<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "HEAD /e-business/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | +| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*><*><*>.<*><*>.<*><*><*>.<*> - pouros<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | +| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*><*><*>.<*><*><*>.<*><*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "PATCH /strategize/out-of-the-box HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | +| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*><*><*>.<*><*><*>.<*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "POST /users HTTP/<*>.<*>" <*><*><*> <*><*><*><*> | ++-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +## Simple Pattern Example 4: Return log patterns aggregation result + +This example shows how to get aggregated results from a raw log field. 
+ +```ppl +source=apache +| patterns message method=simple_pattern mode=aggregation +| fields patterns_field, pattern_count, sample_logs +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ +| patterns_field | pattern_count | sample_logs | +|---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------| +| <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | 1 | [210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | +| <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | 1 | [118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439] | +| <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927] | +| <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722] | ++---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ +``` + +## Simple Pattern Example 5: Return log patterns aggregation result with detected variable tokens + +This example shows how to get aggregated results with detected variable tokens. +## Configuration + +With option `show_numbered_token` enabled, the output can detect numbered variable tokens from the pattern field. 
+ +```ppl +source=apache +| patterns message method=simple_pattern mode=aggregation show_numbered_token=true +| fields patterns_field, pattern_count, tokens +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| patterns_field | pattern_count | tokens | +|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ... - - [//::: -] " / /." | 1 | {'': ['HTTP'], '': ['users'], '': ['1'], '': ['1'], '': ['9481'], '': ['301'], '': ['28'], '': ['104'], '': ['2022'], '': ['Sep'], '': ['15'], '': ['10'], '': ['57'], '': ['210'], '': ['POST'], '': ['15'], '': ['0700'], '': ['204']} | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +## Brain Example 1: Extract log patterns + +This example shows how to extract semantic meaningful log patterns from a raw log field using the brain algorithm. The default variable count threshold is 5. 
+ +```ppl +source=apache +| patterns message method=brain +| fields message, patterns_field +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +| message | patterns_field | +|-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------| +| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "HEAD /e-business/mindshare HTTP/<*>" 404 <*> | +| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>" 100 <*> | +| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "PATCH /strategize/out-of-the-box HTTP/<*>" 401 <*> | +| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "POST /users HTTP/<*>" 301 <*> | ++-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ +``` + +## Brain Example 2: Extract log patterns with custom parameters + +This example shows how to extract semantic meaningful log patterns from a raw log field using custom parameters of the brain algorithm. 
+ +```ppl +source=apache +| patterns message method=brain variable_count_threshold=2 +| fields message, patterns_field +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------+ +| message | patterns_field | +|-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------| +| 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | +| 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | +| 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | +| 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | ++-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------+ +``` + +## Brain Example 3: Return log patterns aggregation result + +This example shows how to get aggregated results from a raw log field using the brain algorithm. + +```ppl +source=apache +| patterns message method=brain mode=aggregation variable_count_threshold=2 +| fields patterns_field, pattern_count, sample_logs +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| patterns_field | pattern_count | sample_logs | +|----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | 4 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927,127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722,118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439,210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | 
++----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +## Brain Example 4: Return log patterns aggregation result with detected variable tokens + +This example shows how to get aggregated results with detected variable tokens using the brain algorithm. + +With option `show_numbered_token` enabled, the output can detect numbered variable tokens from the pattern field. + +```ppl +source=apache +| patterns message method=brain mode=aggregation show_numbered_token=true variable_count_threshold=2 +| fields patterns_field, pattern_count, tokens +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| patterns_field | pattern_count | tokens | +|----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| - [/Sep/::: ] HTTP/" | 4 | {'': ['19927', '28722', '27439', '9481'], '': ['10', '10', '10', '10'], '': ['2022', '2022', '2022', '2022'], '': ['57', '57', '57', '57'], '': ['15', '15', '15', '15'], '': ['"HEAD', '"GET', '"PATCH', '"POST'], '': ['-0700', '-0700', '-0700', '-0700'], '': ['/e-business/mindshare', '/architectures/convergence/niches/mindshare', '/strategize/out-of-the-box', '/users'], '': ['177.95.8.74', '127.45.152.6', '118.223.210.10... 
|
++----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+```
+
+## Limitations
+
+- The `patterns` command is not pushed down to OpenSearch data nodes for now. It only groups log patterns on the log messages returned to the coordinator node.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/patterns.rst b/docs/user/ppl/cmd/patterns.rst
deleted file mode 100644
index ec87aca7494..00000000000
--- a/docs/user/ppl/cmd/patterns.rst
+++ /dev/null
@@ -1,225 +0,0 @@
-========
-patterns
-========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``patterns`` command extracts log patterns from a text field and appends the results to the search result. Grouping logs by their patterns makes it easier to aggregate stats from large volumes of log data for analysis and troubleshooting.
-
-| ``patterns`` command allows users to select different log parsing algorithms to get high log pattern grouping accuracy. Two pattern methods are supported: ``simple_pattern`` and ``brain``.
-
-| ``simple_pattern`` algorithm is basically a regex parsing method vs ``brain`` algorithm is an automatic log grouping algorithm with high grouping accuracy and keeps semantic meaning.
-
-| ``patterns`` command supports two modes: ``label`` and ``aggregation``. ``label`` mode returns individual pattern labels. ``aggregation`` mode returns aggregated results on target field.
-
-| Calcite engine by default labels the variables with '<*>' placeholder. If ``show_numbered_token`` option is turned on, Calcite engine's ``label`` mode not only labels pattern of text but also labels variable tokens in map. In ``aggregation`` mode, it will also output labeled pattern as well as variable tokens per pattern. The variable placeholder is in the format of '' instead of '<*>'.
-
-Syntax
-======
-patterns [by byClause...] [method=simple_pattern | brain] [mode=label | aggregation] [max_sample_count=integer] [buffer_limit=integer] [show_numbered_token=boolean] [new_field=] (algorithm parameters...)
-
-* field: mandatory. The text field to analyze for patterns.
-* byClause: optional. Fields or scalar functions used to group logs for labeling/aggregation.
-* method: optional. Algorithm choice: ``simple_pattern`` or ``brain``. **Default:** ``simple_pattern``.
-* mode: optional. Output mode: ``label`` or ``aggregation``. **Default:** ``label``.
-* max_sample_count: optional. Max sample logs returned per pattern in aggregation mode. **Default:** 10.
-* buffer_limit: optional. Safeguard parameter for ``brain`` algorithm to limit internal temporary buffer size (min: 50,000). **Default:** 100,000.
-* show_numbered_token: optional. The flag to turn on numbered token output format. **Default:** false.
-* new_field: optional. Alias of the output pattern field. **Default:** "patterns_field".
-* algorithm parameters: optional. Algorithm-specific tuning:
-
-  * ``simple_pattern``: Define regex via "pattern".
- * ``brain``: Adjust sensitivity with variable_count_threshold and frequency_threshold_percentage. - - * ``variable_count_threshold``: optional integer. Words are split by space. Algorithm counts how many distinct words are at specific position in initial log groups. Adjusting this threshold can determine the sensitivity of constant words. **Default:** 5. - * ``frequency_threshold_percentage``: optional double. Brain's log pattern is selected based on longest word combination. This sets the lower bound of frequency to ignore low frequency words. **Default:** 0.3. - -Change the default pattern method -================================= -To override default pattern parameters, users can run following command - -.. code-block:: - - PUT _cluster/settings - { - "persistent": { - "plugins.ppl.pattern.method": "brain", - "plugins.ppl.pattern.mode": "aggregation", - "plugins.ppl.pattern.max.sample.count": 5, - "plugins.ppl.pattern.buffer.limit": 50000, - "plugins.ppl.pattern.show.numbered.token": true - } - } - -Simple Pattern Example 1: Create the new field -============================================== - -This example shows how to extract patterns in ``email`` for each document. Parsing a null field will return an empty string. - -PPL query:: - - os> source=accounts | patterns email method=simple_pattern | fields email, patterns_field ; - fetched rows / total rows = 4/4 - +-----------------------+----------------+ - | email | patterns_field | - |-----------------------+----------------| - | amberduke@pyrami.com | <*>@<*>.<*> | - | hattiebond@netagy.com | <*>@<*>.<*> | - | null | | - | daleadams@boink.com | <*>@<*>.<*> | - +-----------------------+----------------+ - -Simple Pattern Example 2: Extract log patterns -============================================== - -This example shows how to extract patterns from a raw log field using the default patterns. 
- -PPL query:: - - os> source=apache | patterns message method=simple_pattern | fields message, patterns_field ; - fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | message | patterns_field | - |-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | - +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ - -Simple Pattern Example 3: Extract log patterns with custom regex pattern -======================================================================== - -This example shows how to extract patterns from a raw log field using user defined patterns. 
- -PPL query:: - - os> source=apache | patterns message method=simple_pattern new_field='no_numbers' pattern='[0-9]' | fields message, no_numbers ; - fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | message | no_numbers | - |-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*><*><*>.<*><*>.<*>.<*><*> - upton<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "HEAD /e-business/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*><*><*>.<*><*>.<*><*><*>.<*> - pouros<*><*><*><*> [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*><*><*>.<*><*><*>.<*><*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "PATCH /strategize/out-of-the-box HTTP/<*>.<*>" <*><*><*> <*><*><*><*><*> | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*><*><*>.<*><*><*>.<*><*>.<*><*><*> - - [<*><*>/Sep/<*><*><*><*>:<*><*>:<*><*>:<*><*> -<*><*><*><*>] "POST /users HTTP/<*>.<*>" <*><*><*> <*><*><*><*> | - +-----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - -Simple Pattern Example 4: Return log patterns aggregation result -================================================================ - -This example shows how to get aggregated results from a raw log field. 
- -PPL query:: - - os> source=apache | patterns message method=simple_pattern mode=aggregation | fields patterns_field, pattern_count, sample_logs ; - fetched rows / total rows = 4/4 - +---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ - | patterns_field | pattern_count | sample_logs | - |---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------| - | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*> <*>/<*>.<*>" <*> <*> | 1 | [210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | - | <*>.<*>.<*>.<*> - - [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>-<*>-<*>-<*> <*>/<*>.<*>" <*> <*> | 1 | [118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439] | - | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>-<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927] | - | <*>.<*>.<*>.<*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> -<*>] "<*> /<*>/<*>/<*>/<*> <*>/<*>.<*>" <*> <*> | 1 | [127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722] | - +---------------------------------------------------------------------------------------------------+---------------+-------------------------------------------------------------------------------------------------------------------------------+ - -Simple Pattern Example 5: Return log patterns aggregation result with detected variable tokens -============================================================================================== - -This example shows how to get aggregated results with detected variable tokens. - -Configuration -------------- -With option ``show_numbered_token`` enabled, the output can detect numbered variable tokens from the pattern field. 
- -PPL query:: - - os> source=apache | patterns message method=simple_pattern mode=aggregation show_numbered_token=true | fields patterns_field, pattern_count, tokens | head 1 ; - fetched rows / total rows = 1/1 - +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | patterns_field | pattern_count | tokens | - |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | ... - - [//::: -] " / /." | 1 | {'': ['HTTP'], '': ['users'], '': ['1'], '': ['1'], '': ['9481'], '': ['301'], '': ['28'], '': ['104'], '': ['2022'], '': ['Sep'], '': ['15'], '': ['10'], '': ['57'], '': ['210'], '': ['POST'], '': ['15'], '': ['0700'], '': ['204']} | - +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - -Brain Example 1: Extract log patterns -===================================== - -This example shows how to extract semantic meaningful log patterns from a raw log field using the brain algorithm. The default variable count threshold is 5. 
- -PPL query:: - - os> source=apache | patterns message method=brain | fields message, patterns_field ; - fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ - | message | patterns_field | - |-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "HEAD /e-business/mindshare HTTP/<*>" 404 <*> | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] "GET /architectures/convergence/niches/mindshare HTTP/<*>" 100 <*> | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "PATCH /strategize/out-of-the-box HTTP/<*>" 401 <*> | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - - [<*>/Sep/<*>:<*>:<*>:<*> <*>] "POST /users HTTP/<*>" 301 <*> | - +-----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+ - -Brain Example 2: Extract log patterns with custom parameters -============================================================ - -This example shows how to extract semantic meaningful log patterns from a raw log field using custom parameters of the brain algorithm. 
- -PPL query:: - - os> source=apache | patterns message method=brain variable_count_threshold=2 | fields message, patterns_field ; - fetched rows / total rows = 4/4 - +-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------+ - | message | patterns_field | - |-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------| - | 177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | - | 127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | - | 118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | - | 210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481 | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | - +-----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------+ - -Brain Example 3: Return log patterns aggregation result -======================================================= - -This example shows how to get aggregated results from a raw log field using the brain algorithm. - -PPL query:: - - os> source=apache | patterns message method=brain mode=aggregation variable_count_threshold=2 | fields patterns_field, pattern_count, sample_logs ; - fetched rows / total rows = 1/1 - +----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | patterns_field | pattern_count | sample_logs | - |----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | <*IP*> - <*> [<*>/Sep/<*>:<*>:<*>:<*> <*>] <*> <*> HTTP/<*>" <*> <*> | 4 | [177.95.8.74 - upton5450 [28/Sep/2022:10:15:57 -0700] "HEAD /e-business/mindshare HTTP/1.0" 404 19927,127.45.152.6 - pouros8756 [28/Sep/2022:10:15:57 -0700] "GET /architectures/convergence/niches/mindshare HTTP/1.0" 100 28722,118.223.210.105 - - [28/Sep/2022:10:15:57 -0700] "PATCH /strategize/out-of-the-box HTTP/1.0" 401 27439,210.204.15.104 - - [28/Sep/2022:10:15:57 -0700] "POST /users HTTP/1.1" 301 9481] | - 
+----------------------------------------------------------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - -Brain Example 4: Return log patterns aggregation result with detected variable tokens -===================================================================================== - -This example shows how to get aggregated results with detected variable tokens using the brain algorithm. - -Configuration -------------- -With option ``show_numbered_token`` enabled, the output can detect numbered variable tokens from the pattern field. - -PPL query:: - - os> source=apache | patterns message method=brain mode=aggregation show_numbered_token=true variable_count_threshold=2 | fields patterns_field, pattern_count, tokens ; - fetched rows / total rows = 1/1 - +----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | patterns_field | pattern_count | tokens | - |----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | - [/Sep/::: ] HTTP/" | 4 | {'': ['19927', '28722', '27439', '9481'], '': ['10', '10', '10', '10'], '': ['2022', '2022', '2022', '2022'], '': ['57', '57', '57', '57'], '': ['15', '15', '15', '15'], '': ['"HEAD', '"GET', '"PATCH', '"POST'], '': ['-0700', '-0700', '-0700', '-0700'], '': ['/e-business/mindshare', '/architectures/convergence/niches/mindshare', '/strategize/out-of-the-box', '/users'], '': ['177.95.8.74', '127.45.152.6', '118.223.210.10... 
|
-    +----------------------------------------------------------------------------------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-Limitations
-===========
-
-- Patterns command is not pushed down to OpenSearch data node for now. It will only group log patterns on log messages returned to coordinator node.
diff --git a/docs/user/ppl/cmd/rare.md b/docs/user/ppl/cmd/rare.md
new file mode 100644
index 00000000000..6ee51c9f96a
--- /dev/null
+++ b/docs/user/ppl/cmd/rare.md
@@ -0,0 +1,146 @@
+# rare
+
+## Description
+
+The `rare` command finds the least common tuple of values of all fields in the field list.
+**Note**: A maximum of 10 results is returned for each distinct tuple of values of the group-by fields.
+## Syntax
+
+rare [rare-options] \<field-list\> [by-clause]
+* field-list: mandatory. Comma-delimited list of field names.
+* by-clause: optional. One or more fields to group the results by.
+* rare-options: optional. Options for the rare command. Supported syntax is [countfield=\<field\>] [showcount=\<bool\>].
+* showcount=\<bool\>: optional. Whether to create a field in the output that represents a count of the tuple of values. **Default:** `true`.
+* countfield=\<field\>: optional. The name of the field that contains the count. **Default:** `'count'`.
+* usenull=\<bool\>: optional. Whether to output the null value. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`:
+  * When `plugins.ppl.syntax.legacy.preferred=true`, `usenull` defaults to `true`
+  * When `plugins.ppl.syntax.legacy.preferred=false`, `usenull` defaults to `false`
+
+## Example 1: Find the least common values in a field
+
+This example shows how to find the least common gender of all the accounts.
+
+```ppl
+source=accounts
+| rare showcount=false gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+
+| gender |
+|--------|
+| F      |
+| M      |
++--------+
+```
+
+## Example 2: Find the least common values organized by gender
+
+This example shows how to find the least common age of all the accounts grouped by gender.
+
+```ppl
+source=accounts
+| rare showcount=false age by gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++--------+-----+
+| gender | age |
+|--------+-----|
+| F      | 28  |
+| M      | 32  |
+| M      | 33  |
+| M      | 36  |
++--------+-----+
+```
+
+## Example 3: Rare command
+
+This example shows how to find the least common gender of all the accounts.
+
+```ppl
+source=accounts
+| rare gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+-------+
+| gender | count |
+|--------+-------|
+| F      | 1     |
+| M      | 3     |
++--------+-------+
+```
+
+## Example 4: Specify the count field option
+
+This example shows how to specify the count field.
+
+```ppl
+source=accounts
+| rare countfield='cnt' gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+-----+
+| gender | cnt |
+|--------+-----|
+| F      | 1   |
+| M      | 3   |
++--------+-----+
+```
+
+## Example 5: Specify the usenull field option
+
+```ppl
+source=accounts
+| rare usenull=false email
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++-----------------------+-------+
+| email                 | count |
+|-----------------------+-------|
+| amberduke@pyrami.com  | 1     |
+| daleadams@boink.com   | 1     |
+| hattiebond@netagy.com | 1     |
++-----------------------+-------+
+```
+
+```ppl
+source=accounts
+| rare usenull=true email
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------------------+-------+
+| email                 | count |
+|-----------------------+-------|
+| null                  | 1     |
+| amberduke@pyrami.com  | 1     |
+| daleadams@boink.com   | 1     |
+| hattiebond@netagy.com | 1     |
++-----------------------+-------+
+```
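+
+The rare options can also be combined in a single command. The following sketch is illustrative (output not shown): building on Example 5, it should return the same rows including the null value, with the count column renamed to `email_count`.
+
+```ppl
+source=accounts
+| rare usenull=true countfield='email_count' email
+```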
+
+## Limitations
+
+The `rare` command is not rewritten to OpenSearch DSL; it is only executed on the coordination node.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/rare.rst b/docs/user/ppl/cmd/rare.rst
deleted file mode 100644
index e72c8c8c2c9..00000000000
--- a/docs/user/ppl/cmd/rare.rst
+++ /dev/null
@@ -1,132 +0,0 @@
-====
-rare
-====
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``rare`` command finds the least common tuple of values of all fields in the field list.
-
-| **Note**: A maximum of 10 results is returned for each distinct tuple of values of the group-by fields.
-
-Syntax
-======
-rare [rare-options] <field-list> [by-clause]
-
-* field-list: mandatory. Comma-delimited list of field names.
-* by-clause: optional. One or more fields to group the results by.
-* rare-options: optional. Options for the rare command. Supported syntax is [countfield=<field>] [showcount=<bool>].
-* showcount=<bool>: optional. Whether to create a field in output that represent a count of the tuple of values. **Default:** ``true``.
-* countfield=<field>: optional. The name of the field that contains count. **Default:** ``'count'``.
-* usenull=<bool>: optional. whether to output the null value. **Default:** Determined by ``plugins.ppl.syntax.legacy.preferred``:
-
-  * When ``plugins.ppl.syntax.legacy.preferred=true``, ``usenull`` defaults to ``true``
-  * When ``plugins.ppl.syntax.legacy.preferred=false``, ``usenull`` defaults to ``false``
-
-Example 1: Find the least common values in a field
-==================================================
-
-This example shows how to find the least common gender of all the accounts.
-
-PPL query::
-
-    os> source=accounts | rare showcount=false gender;
-    fetched rows / total rows = 2/2
-    +--------+
-    | gender |
-    |--------|
-    | F      |
-    | M      |
-    +--------+
-
-
-Example 2: Find the least common values organized by gender
-===========================================================
-
-This example shows how to find the least common age of all the accounts grouped by gender.
-
-PPL query::
-
-    os> source=accounts | rare showcount=false age by gender;
-    fetched rows / total rows = 4/4
-    +--------+-----+
-    | gender | age |
-    |--------+-----|
-    | F      | 28  |
-    | M      | 32  |
-    | M      | 33  |
-    | M      | 36  |
-    +--------+-----+
-
-Example 3: Rare command
-=======================
-
-This example shows how to find the least common gender of all the accounts.
-
-PPL query::
-
-    os> source=accounts | rare gender;
-    fetched rows / total rows = 2/2
-    +--------+-------+
-    | gender | count |
-    |--------+-------|
-    | F      | 1     |
-    | M      | 3     |
-    +--------+-------+
-
-
-Example 4: Specify the count field option
-=========================================
-
-This example shows how to specify the count field.
-
-PPL query::
-
-    os> source=accounts | rare countfield='cnt' gender;
-    fetched rows / total rows = 2/2
-    +--------+-----+
-    | gender | cnt |
-    |--------+-----|
-    | F      | 1   |
-    | M      | 3   |
-    +--------+-----+
-
-
-Example 5: Specify the usenull field option
-===========================================
-
-PPL query::
-
-    os> source=accounts | rare usenull=false email;
-    fetched rows / total rows = 3/3
-    +-----------------------+-------+
-    | email                 | count |
-    |-----------------------+-------|
-    | amberduke@pyrami.com  | 1     |
-    | daleadams@boink.com   | 1     |
-    | hattiebond@netagy.com | 1     |
-    +-----------------------+-------+
-
-PPL query::
-
-    os> source=accounts | rare usenull=true email;
-    fetched rows / total rows = 4/4
-    +-----------------------+-------+
-    | email                 | count |
-    |-----------------------+-------|
-    | null                  | 1     |
-    | amberduke@pyrami.com  | 1     |
-    | daleadams@boink.com   | 1     |
-    | hattiebond@netagy.com | 1     |
-    +-----------------------+-------+
-
-
-Limitations
-===========
-The ``rare`` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node.
diff --git a/docs/user/ppl/cmd/regex.md b/docs/user/ppl/cmd/regex.md
new file mode 100644
index 00000000000..d108b635abc
--- /dev/null
+++ b/docs/user/ppl/cmd/regex.md
@@ -0,0 +1,155 @@
+# regex
+
+## Description
+
+The `regex` command filters search results by matching field values against a regular expression pattern. Only documents where the specified field matches the pattern are included in the results.
+## Syntax
+
+regex \<field\> = \<pattern\>
+regex \<field\> != \<pattern\>
+* field: mandatory. The field name to match against.
+* pattern: mandatory string. The regular expression pattern to match. Supports Java regex syntax including named groups, lookahead/lookbehind, and character classes.
+* = : operator for positive matching (include matches)
+* != : operator for negative matching (exclude matches)
+
+## Regular Expression Engine
+
+The regex command uses Java's built-in regular expression engine, which supports:
+* **Standard regex features**: Character classes, quantifiers, anchors
+* **Named capture groups**: `(?<name>pattern)` syntax
+* **Lookahead/lookbehind**: `(?=...)` and `(?<=...)` assertions
+* **Inline flags**: Case-insensitive `(?i)`, multiline `(?m)`, dotall `(?s)`, and other modes
+
+For complete documentation of Java regex patterns and available modes, see the [Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html).
+## Example 1: Basic pattern matching
+
+This example shows how to filter documents where the `lastname` field matches names starting with uppercase letters.
+
+```ppl
+source=accounts
+| regex lastname="^[A-Z][a-z]+$"
+| fields account_number, firstname, lastname
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------+-----------+----------+
+| account_number | firstname | lastname |
+|----------------+-----------+----------|
+| 1              | Amber     | Duke     |
+| 6              | Hattie    | Bond     |
+| 13             | Nanette   | Bates    |
+| 18             | Dale      | Adams    |
++----------------+-----------+----------+
+```
+
+## Example 2: Negative matching
+
+This example shows how to exclude documents where the `lastname` field ends with "son".
+
+```ppl
+source=accounts
+| regex lastname!=".*son$"
+| fields account_number, lastname
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------+----------+
+| account_number | lastname |
+|----------------+----------|
+| 1              | Duke     |
+| 6              | Bond     |
+| 13             | Bates    |
+| 18             | Adams    |
++----------------+----------+
+```
+
+## Example 3: Email domain matching
+
+This example shows how to filter documents by email domain patterns.
+
+```ppl
+source=accounts
+| regex email="@pyrami\.com$"
+| fields account_number, email
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------+----------------------+
+| account_number | email                |
+|----------------+----------------------|
+| 1              | amberduke@pyrami.com |
++----------------+----------------------+
+```
+
+## Example 4: Complex patterns with character classes
+
+This example shows how to use complex regex patterns with character classes and quantifiers.
+
+```ppl
+source=accounts
+| regex address="\\d{3,4}\\s+[A-Z][a-z]+\\s+(Street|Lane|Court)"
+| fields account_number, address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------+----------------------+
+| account_number | address              |
+|----------------+----------------------|
+| 1              | 880 Holmes Lane      |
+| 6              | 671 Bristol Street   |
+| 13             | 789 Madison Street   |
+| 18             | 467 Hutchinson Court |
++----------------+----------------------+
+```
+
+## Example 5: Case-sensitive matching
+
+This example demonstrates that regex matching is case-sensitive by default.
+
+```ppl
+source=accounts
+| regex state="va"
+| fields account_number, state
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 0/0
++----------------+-------+
+| account_number | state |
+|----------------+-------|
++----------------+-------+
+```
+
+```ppl
+source=accounts
+| regex state="VA"
+| fields account_number, state
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------+-------+
+| account_number | state |
+|----------------+-------|
+| 13             | VA    |
++----------------+-------+
+```
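+
+As noted in the Regular Expression Engine section, inline flags can relax this default. The following sketch is illustrative (output not shown): prefixing the pattern with the case-insensitive flag `(?i)` should let the lowercase pattern match the uppercase `VA` value from the previous query.
+
+```ppl
+source=accounts
+| regex state="(?i)va"
+| fields account_number, state
+```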
+
+## Limitations
+
+* **Field specification required**: A field name must be specified in the regex command. Pattern-only syntax (e.g., `regex "pattern"`) is not currently supported
+* **String fields only**: The regex command currently only supports string fields. Using it on numeric or boolean fields will result in an error
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/regex.rst b/docs/user/ppl/cmd/regex.rst
deleted file mode 100644
index 154949ba133..00000000000
--- a/docs/user/ppl/cmd/regex.rst
+++ /dev/null
@@ -1,140 +0,0 @@
-=====
-regex
-=====
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``regex`` command filters search results by matching field values against a regular expression pattern. Only documents where the specified field matches the pattern are included in the results.
-
-Syntax
-======
-regex <field> = <pattern>
-regex <field> != <pattern>
-
-* field: mandatory. The field name to match against.
-* pattern: mandatory string. The regular expression pattern to match. Supports Java regex syntax including named groups, lookahead/lookbehind, and character classes.
-* = : operator for positive matching (include matches)
-* != : operator for negative matching (exclude matches)
-
-Regular Expression Engine
-=========================
-
-The regex command uses Java's built-in regular expression engine, which supports:
-
-* **Standard regex features**: Character classes, quantifiers, anchors
-* **Named capture groups**: ``(?<name>pattern)`` syntax
-* **Lookahead/lookbehind**: ``(?=...)`` and ``(?<=...)`` assertions
-* **Inline flags**: Case-insensitive ``(?i)``, multiline ``(?m)``, dotall ``(?s)``, and other modes
-
-For complete documentation of Java regex patterns and available modes, see the `Java Pattern documentation <https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`_.
-
-Example 1: Basic pattern matching
-=================================
-
-This example shows how to filter documents where the ``lastname`` field matches names starting with uppercase letters.
-
-PPL query::
-
-    os> source=accounts | regex lastname="^[A-Z][a-z]+$" | fields account_number, firstname, lastname;
-    fetched rows / total rows = 4/4
-    +----------------+-----------+----------+
-    | account_number | firstname | lastname |
-    |----------------+-----------+----------|
-    | 1              | Amber     | Duke     |
-    | 6              | Hattie    | Bond     |
-    | 13             | Nanette   | Bates    |
-    | 18             | Dale      | Adams    |
-    +----------------+-----------+----------+
-
-
-Example 2: Negative matching
-============================
-
-This example shows how to exclude documents where the ``lastname`` field ends with "son".
-
-PPL query::
-
-    os> source=accounts | regex lastname!=".*son$" | fields account_number, lastname;
-    fetched rows / total rows = 4/4
-    +----------------+----------+
-    | account_number | lastname |
-    |----------------+----------|
-    | 1              | Duke     |
-    | 6              | Bond     |
-    | 13             | Bates    |
-    | 18             | Adams    |
-    +----------------+----------+
-
-
-Example 3: Email domain matching
-================================
-
-This example shows how to filter documents by email domain patterns.
-
-PPL query::
-
-    os> source=accounts | regex email="@pyrami\.com$" | fields account_number, email;
-    fetched rows / total rows = 1/1
-    +----------------+----------------------+
-    | account_number | email                |
-    |----------------+----------------------|
-    | 1              | amberduke@pyrami.com |
-    +----------------+----------------------+
-
-
-Example 4: Complex patterns with character classes
-==================================================
-
-This example shows how to use complex regex patterns with character classes and quantifiers.
-
-PPL query::
-
-    os> source=accounts | regex address="\d{3,4}\s+[A-Z][a-z]+\s+(Street|Lane|Court)" | fields account_number, address;
-    fetched rows / total rows = 4/4
-    +----------------+----------------------+
-    | account_number | address              |
-    |----------------+----------------------|
-    | 1              | 880 Holmes Lane      |
-    | 6              | 671 Bristol Street   |
-    | 13             | 789 Madison Street   |
-    | 18             | 467 Hutchinson Court |
-    +----------------+----------------------+
-
-
-Example 5: Case-sensitive matching
-==================================
-
-This example demonstrates that regex matching is case-sensitive by default.
-
-PPL query::
-
-    os> source=accounts | regex state="va" | fields account_number, state;
-    fetched rows / total rows = 0/0
-    +----------------+-------+
-    | account_number | state |
-    |----------------+-------|
-    +----------------+-------+
-
-PPL query::
-
-    os> source=accounts | regex state="VA" | fields account_number, state;
-    fetched rows / total rows = 1/1
-    +----------------+-------+
-    | account_number | state |
-    |----------------+-------|
-    | 13             | VA    |
-    +----------------+-------+
-
-
-Limitations
-===========
-
-| * **Field specification required**: A field name must be specified in the regex command. Pattern-only syntax (e.g., ``regex "pattern"``) is not currently supported
-| * **String fields only**: The regex command currently only supports string fields. Using it on numeric or boolean fields will result in an error
diff --git a/docs/user/ppl/cmd/rename.md b/docs/user/ppl/cmd/rename.md
new file mode 100644
index 00000000000..346513f232e
--- /dev/null
+++ b/docs/user/ppl/cmd/rename.md
@@ -0,0 +1,142 @@
+# rename
+
+## Description
+
+The `rename` command renames one or more fields in the search result.
+## Syntax
+
+rename \<source-field\> AS \<target-field\>["," \<source-field\> AS \<target-field\>]...
+* source-field: mandatory. The name of the field you want to rename. Supports wildcard patterns using `*`.
+* target-field: mandatory. The name you want to rename to. Must have the same number of wildcards as the source.
+
+## Behavior
+
+The rename command handles non-existent fields as follows:
+* **Renaming a non-existent field to a non-existent field**: No change occurs to the result set.
+* **Renaming a non-existent field to an existing field**: The existing target field is removed from the result set.
+* **Renaming an existing field to an existing field**: The existing target field is removed and the source field is renamed to the target.
+
+## Example 1: Rename one field
+
+This example shows how to rename one field.
+
+```ppl
+source=accounts
+| rename account_number as an
+| fields an
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----+
+| an |
+|----|
+| 1  |
+| 6  |
+| 13 |
+| 18 |
++----+
+```
+
+## Example 2: Rename multiple fields
+
+This example shows how to rename multiple fields.
+
+```ppl
+source=accounts
+| rename account_number as an, employer as emp
+| fields an, emp
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----+---------+
+| an | emp     |
+|----+---------|
+| 1  | Pyrami  |
+| 6  | Netagy  |
+| 13 | Quility |
+| 18 | null    |
++----+---------+
+```
+
+## Example 3: Rename with wildcards
+
+This example shows how to rename multiple fields using wildcard patterns.
+
+```ppl
+source=accounts
+| rename *name as *_name
+| fields first_name, last_name
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++------------+-----------+
+| first_name | last_name |
+|------------+-----------|
+| Amber      | Duke      |
+| Hattie     | Bond      |
+| Nanette    | Bates     |
+| Dale       | Adams     |
++------------+-----------+
+```
+
+## Example 4: Rename with multiple wildcard patterns
+
+This example shows how to rename multiple fields using multiple wildcard patterns.
+
+```ppl
+source=accounts
+| rename *name as *_name, *_number as *number
+| fields first_name, last_name, accountnumber
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++------------+-----------+---------------+
+| first_name | last_name | accountnumber |
+|------------+-----------+---------------|
+| Amber      | Duke      | 1             |
+| Hattie     | Bond      | 6             |
+| Nanette    | Bates     | 13            |
+| Dale       | Adams     | 18            |
++------------+-----------+---------------+
+```
+
+## Example 5: Rename existing field to existing field
+
+This example shows how to rename an existing field to an existing field. The target field gets removed and the source field is renamed to the target field.
+
+```ppl
+source=accounts
+| rename firstname as age
+| fields age
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++---------+
+| age     |
+|---------|
+| Amber   |
+| Hattie  |
+| Nanette |
+| Dale    |
++---------+
+```
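+
+The non-existent-field behavior described in the Behavior section can be sketched the same way. The following query is illustrative (output not shown); assuming `nonexistent` is not a field in the index, the existing `employer` field should simply be removed from the result set while the remaining fields pass through unchanged.
+
+```ppl
+source=accounts
+| rename nonexistent as employer
+```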
+
+## Limitations
+
+The `rename` command is not rewritten to OpenSearch DSL; it is only executed on the coordination node.
+Literal asterisk (*) characters in field names cannot be replaced, as the asterisk is used for wildcard matching.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/rename.rst b/docs/user/ppl/cmd/rename.rst
deleted file mode 100644
index eb92a45b8cb..00000000000
--- a/docs/user/ppl/cmd/rename.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-======
-rename
-======
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``rename`` command renames one or more fields in the search result.
-
-Syntax
-======
-rename <source-field> AS <target-field>["," <source-field> AS <target-field>]...
-
-* source-field: mandatory. The name of the field you want to rename. Supports wildcard patterns using ``*``.
-* target-field: mandatory. The name you want to rename to. Must have same number of wildcards as the source.
-
-Behavior
-========
-
-The rename command handles non-existent fields as follows:
-
-* **Renaming a non-existent field to a non-existent field**: No change occurs to the result set.
-* **Renaming a non-existent field to an existing field**: The existing target field is removed from the result set.
-* **Renaming an existing field to an existing field**: The existing target field is removed and the source field is renamed to the target.
-
-Example 1: Rename one field
-===========================
-
-This example shows how to rename one field.
-
-PPL query::
-
-    os> source=accounts | rename account_number as an | fields an;
-    fetched rows / total rows = 4/4
-    +----+
-    | an |
-    |----|
-    | 1  |
-    | 6  |
-    | 13 |
-    | 18 |
-    +----+
-
-
-Example 2: Rename multiple fields
-=================================
-
-This example shows how to rename multiple fields.
-
-PPL query::
-
-    os> source=accounts | rename account_number as an, employer as emp | fields an, emp;
-    fetched rows / total rows = 4/4
-    +----+---------+
-    | an | emp     |
-    |----+---------|
-    | 1  | Pyrami  |
-    | 6  | Netagy  |
-    | 13 | Quility |
-    | 18 | null    |
-    +----+---------+
-
-
-Example 3: Rename with wildcards
-================================
-
-This example shows how to rename multiple fields using wildcard patterns.
-
-PPL query::
-
-    os> source=accounts | rename *name as *_name | fields first_name, last_name;
-    fetched rows / total rows = 4/4
-    +------------+-----------+
-    | first_name | last_name |
-    |------------+-----------|
-    | Amber      | Duke      |
-    | Hattie     | Bond      |
-    | Nanette    | Bates     |
-    | Dale       | Adams     |
-    +------------+-----------+
-
-
-Example 4: Rename with multiple wildcard patterns
-=================================================
-
-This example shows how to rename multiple fields using multiple wildcard patterns.
-
-PPL query::
-
-    os> source=accounts | rename *name as *_name, *_number as *number | fields first_name, last_name, accountnumber;
-    fetched rows / total rows = 4/4
-    +------------+-----------+---------------+
-    | first_name | last_name | accountnumber |
-    |------------+-----------+---------------|
-    | Amber      | Duke      | 1             |
-    | Hattie     | Bond      | 6             |
-    | Nanette    | Bates     | 13            |
-    | Dale       | Adams     | 18            |
-    +------------+-----------+---------------+
-
-Example 5: Rename existing field to existing field
-==================================================
-
-This example shows how to rename an existing field to an existing field. The target field gets removed and the source field is renamed to the target field.
-
-
-PPL query::
-
-    os> source=accounts | rename firstname as age | fields age;
-    fetched rows / total rows = 4/4
-    +---------+
-    | age     |
-    |---------|
-    | Amber   |
-    | Hattie  |
-    | Nanette |
-    | Dale    |
-    +---------+
-
-
-Limitations
-===========
-| The ``rename`` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node.
-| Literal asterisk (*) characters in field names cannot be replaced as asterisk is used for wildcard matching.
diff --git a/docs/user/ppl/cmd/replace.md b/docs/user/ppl/cmd/replace.md
new file mode 100644
index 00000000000..2333f46b3b2
--- /dev/null
+++ b/docs/user/ppl/cmd/replace.md
@@ -0,0 +1,330 @@
+# replace
+
+## Description
+
+The `replace` command replaces text in one or more fields in the search result. It supports literal string replacement and wildcard patterns using `*`.
+## Syntax
+
+replace '\<pattern\>' WITH '\<replacement\>' [, '\<pattern\>' WITH '\<replacement\>']... IN \<field-name\>[, \<field-name\>]...
+* pattern: mandatory. The text pattern you want to replace.
+* replacement: mandatory. The text you want to replace with.
+* field-name: mandatory. One or more field names where the replacement should occur.
+
+## Example 1: Replace text in one field
+
+This example shows replacing text in one field.
+
+```ppl
+source=accounts
+| replace "IL" WITH "Illinois" IN state
+| fields state
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------+
+| state    |
+|----------|
+| Illinois |
+| TN       |
+| VA       |
+| MD       |
++----------+
+```
+
+## Example 2: Replace text in multiple fields
+
+This example shows replacing text in multiple fields.
+
+```ppl
+source=accounts
+| replace "IL" WITH "Illinois" IN state, address
+| fields state, address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------+----------------------+
+| state    | address              |
+|----------+----------------------|
+| Illinois | 880 Holmes Lane      |
+| TN       | 671 Bristol Street   |
+| VA       | 789 Madison Street   |
+| MD       | 467 Hutchinson Court |
++----------+----------------------+
+```
+
+## Example 3: Replace with other commands in a pipeline
+
+This example shows using replace with other commands in a query pipeline.
+
+```ppl
+source=accounts
+| replace "IL" WITH "Illinois" IN state
+| where age > 30
+| fields state, age
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++----------+-----+
+| state    | age |
+|----------+-----|
+| Illinois | 32  |
+| TN       | 36  |
+| MD       | 33  |
++----------+-----+
+```
+
+## Example 4: Replace with multiple pattern/replacement pairs
+
+This example shows using multiple pattern/replacement pairs in a single replace command. The replacements are applied sequentially.
+
+```ppl
+source=accounts
+| replace "IL" WITH "Illinois", "TN" WITH "Tennessee" IN state
+| fields state
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------+
+| state     |
+|-----------|
+| Illinois  |
+| Tennessee |
+| VA        |
+| MD        |
++-----------+
+```
+
+## Example 5: Pattern matching with LIKE and replace
+
+Since the replace command only supports plain string literals, you can use the LIKE function together with replace for pattern matching needs.
+
+```ppl
+source=accounts
+| where LIKE(address, '%Holmes%')
+| replace "Holmes" WITH "HOLMES" IN address
+| fields address, state, gender, age, city
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------+-------+--------+-----+--------+
+| address         | state | gender | age | city   |
+|-----------------+-------+--------+-----+--------|
+| 880 HOLMES Lane | IL    | M      | 32  | Brogan |
++-----------------+-------+--------+-----+--------+
+```
+
+## Example 6: Wildcard suffix match
+
+Replace values that end with a specific pattern. The wildcard `*` matches any prefix.
+
+```ppl
+source=accounts
+| replace "*IL" WITH "Illinois" IN state
+| fields state
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------+
+| state    |
+|----------|
+| Illinois |
+| TN       |
+| VA       |
+| MD       |
++----------+
+```
+
+## Example 7: Wildcard prefix match
+
+Replace values that start with a specific pattern. The wildcard `*` matches any suffix.
+
+```ppl
+source=accounts
+| replace "IL*" WITH "Illinois" IN state
+| fields state
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------+
+| state    |
+|----------|
+| Illinois |
+| TN       |
+| VA       |
+| MD       |
++----------+
+```
+
+## Example 8: Wildcard capture and substitution
+
+Use wildcards in both pattern and replacement to capture and reuse matched portions. The number of wildcards must match in pattern and replacement.
+
+```ppl
+source=accounts
+| replace "* Lane" WITH "Lane *" IN address
+| fields address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------------+
+| address              |
+|----------------------|
+| Lane 880 Holmes      |
+| 671 Bristol Street   |
+| 789 Madison Street   |
+| 467 Hutchinson Court |
++----------------------+
+```
+
+## Example 9: Multiple wildcards for pattern transformation
+
+Use multiple wildcards to transform patterns. Each wildcard in the replacement substitutes the corresponding captured value.
+
+```ppl
+source=accounts
+| replace "* *" WITH "*_*" IN address
+| fields address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------------+
+| address              |
+|----------------------|
+| 880_Holmes Lane      |
+| 671_Bristol Street   |
+| 789_Madison Street   |
+| 467_Hutchinson Court |
++----------------------+
+```
+
+## Example 10: Wildcard with zero wildcards in replacement
+
+When the replacement has zero wildcards, all matching values are replaced with the literal replacement string.
+
+```ppl
+source=accounts
+| replace "*IL*" WITH "Illinois" IN state
+| fields state
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------+
+| state    |
+|----------|
+| Illinois |
+| TN       |
+| VA       |
+| MD       |
++----------+
+```
+
+## Example 11: Matching literal asterisks
+
+Use `\*` to match literal asterisk characters (`\*` = literal asterisk, `\\` = literal backslash).
+
+```ppl
+source=accounts
+| eval note = 'price: *sale*'
+| replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note
+| fields note
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++------------+
+| note       |
+|------------|
+| DISCOUNTED |
+| DISCOUNTED |
+| DISCOUNTED |
+| DISCOUNTED |
++------------+
+```
+
+## Example 12: Wildcard with no replacement wildcards
+
+Use wildcards in the pattern but none in the replacement to create a fixed output.
+
+```ppl
+source=accounts
+| eval test = 'prefix-value-suffix'
+| replace 'prefix-*-suffix' WITH 'MATCHED' IN test
+| fields test
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++---------+
+| test    |
+|---------|
+| MATCHED |
+| MATCHED |
+| MATCHED |
+| MATCHED |
++---------+
+```
+
+## Example 13: Escaped asterisks with wildcards
+
+Combine escaped asterisks (literal) with wildcards for complex patterns.
+
+```ppl
+source=accounts
+| eval label = 'file123.txt'
+| replace 'file*.*' WITH '\**.*' IN label
+| fields label
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------+
+| label    |
+|----------|
+| *123.txt |
+| *123.txt |
+| *123.txt |
+| *123.txt |
++----------+
+```
+
+## Limitations
+
+* Wildcards: `*` matches zero or more characters (case-sensitive)
+* Replacement wildcards must match the pattern wildcard count, or be zero
+* Escape sequences: `\*` (literal asterisk), `\\` (literal backslash)
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/replace.rst b/docs/user/ppl/cmd/replace.rst
deleted file mode 100644
index 60a28bc8ce0..00000000000
--- a/docs/user/ppl/cmd/replace.rst
+++ /dev/null
@@ -1,268 +0,0 @@
-=======
-replace
-=======
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-The ``replace`` replaces text in one or more fields in the search result. Supports literal string replacement and wildcard patterns using ``*``.
-
-
-Syntax
-======
-replace '<pattern>' WITH '<replacement>' [, '<pattern>' WITH '<replacement>']... IN <field-name>[, <field-name>]...
-
-* pattern: mandatory. The text pattern you want to replace.
-* replacement: mandatory. The text you want to replace with.
-* field-name: mandatory. One or more field names where the replacement should occur.
-
-Example 1: Replace text in one field
-====================================
-
-This example shows replacing text in one field.
-
-PPL query::
-
-    os> source=accounts | replace "IL" WITH "Illinois" IN state | fields state;
-    fetched rows / total rows = 4/4
-    +----------+
-    | state    |
-    |----------|
-    | Illinois |
-    | TN       |
-    | VA       |
-    | MD       |
-    +----------+
-
-
-Example 2: Replace text in multiple fields
-==========================================
-
-This example shows replacing text in multiple fields.
- -PPL query:: - - os> source=accounts | replace "IL" WITH "Illinois" IN state, address | fields state, address; - fetched rows / total rows = 4/4 - +----------+----------------------+ - | state | address | - |----------+----------------------| - | Illinois | 880 Holmes Lane | - | TN | 671 Bristol Street | - | VA | 789 Madison Street | - | MD | 467 Hutchinson Court | - +----------+----------------------+ - - -Example 3: Replace with other commands in a pipeline -==================================================== - -This example shows using replace with other commands in a query pipeline. - -PPL query:: - - os> source=accounts | replace "IL" WITH "Illinois" IN state | where age > 30 | fields state, age; - fetched rows / total rows = 3/3 - +----------+-----+ - | state | age | - |----------+-----| - | Illinois | 32 | - | TN | 36 | - | MD | 33 | - +----------+-----+ - -Example 4: Replace with multiple pattern/replacement pairs -========================================================== - -This example shows using multiple pattern/replacement pairs in a single replace command. The replacements are applied sequentially. - -PPL query:: - - os> source=accounts | replace "IL" WITH "Illinois", "TN" WITH "Tennessee" IN state | fields state; - fetched rows / total rows = 4/4 - +-----------+ - | state | - |-----------| - | Illinois | - | Tennessee | - | VA | - | MD | - +-----------+ - -Example 5: Pattern matching with LIKE and replace -================================================= - -Since replace command only supports plain string literals, you can use LIKE command with replace for pattern matching needs. - -PPL query:: - - os> source=accounts | where LIKE(address, '%Holmes%') | replace "Holmes" WITH "HOLMES" IN address | fields address, state, gender, age, city; - fetched rows / total rows = 1/1 - +-----------------+-------+--------+-----+--------+ - | address | state | gender | age | city | - |-----------------+-------+--------+-----+--------| - | 880 HOLMES Lane | IL | M | 32 | Brogan | - +-----------------+-------+--------+-----+--------+ - - -Example 6: Wildcard suffix match ---------------------------------- - -Replace values that end with a specific pattern. The wildcard ``*`` matches any prefix. - -PPL query:: - - os> source=accounts | replace "*IL" WITH "Illinois" IN state | fields state; - fetched rows / total rows = 4/4 - +----------+ - | state | - |----------| - | Illinois | - | TN | - | VA | - | MD | - +----------+ - - -Example 7: Wildcard prefix match ---------------------------------- - -Replace values that start with a specific pattern. The wildcard ``*`` matches any suffix. - -PPL query:: - - os> source=accounts | replace "IL*" WITH "Illinois" IN state | fields state; - fetched rows / total rows = 4/4 - +----------+ - | state | - |----------| - | Illinois | - | TN | - | VA | - | MD | - +----------+ - - -Example 8: Wildcard capture and substitution ---------------------------------------------- - -Use wildcards in both pattern and replacement to capture and reuse matched portions. The number of wildcards must match in pattern and replacement. 
- -PPL query:: - - os> source=accounts | replace "* Lane" WITH "Lane *" IN address | fields address; - fetched rows / total rows = 4/4 - +----------------------+ - | address | - |----------------------| - | Lane 880 Holmes | - | 671 Bristol Street | - | 789 Madison Street | - | 467 Hutchinson Court | - +----------------------+ - - -Example 9: Multiple wildcards for pattern transformation ---------------------------------------------------------- - -Use multiple wildcards to transform patterns. Each wildcard in the replacement substitutes the corresponding captured value. - -PPL query:: - - os> source=accounts | replace "* *" WITH "*_*" IN address | fields address; - fetched rows / total rows = 4/4 - +----------------------+ - | address | - |----------------------| - | 880_Holmes Lane | - | 671_Bristol Street | - | 789_Madison Street | - | 467_Hutchinson Court | - +----------------------+ - - -Example 10: Wildcard with zero wildcards in replacement --------------------------------------------------------- - -When replacement has zero wildcards, all matching values are replaced with the literal replacement string. - -PPL query:: - - os> source=accounts | replace "*IL*" WITH "Illinois" IN state | fields state; - fetched rows / total rows = 4/4 - +----------+ - | state | - |----------| - | Illinois | - | TN | - | VA | - | MD | - +----------+ - - -Example 11: Matching literal asterisks ---------------------------------------- - -Use ``\*`` to match literal asterisk characters (``\*`` = literal asterisk, ``\\`` = literal backslash). - -PPL query:: - - os> source=accounts | eval note = 'price: *sale*' | replace 'price: \*sale\*' WITH 'DISCOUNTED' IN note | fields note; - fetched rows / total rows = 4/4 - +------------+ - | note | - |------------| - | DISCOUNTED | - | DISCOUNTED | - | DISCOUNTED | - | DISCOUNTED | - +------------+ - -Example 12: Wildcard with no replacement wildcards ----------------------------------------------------- - -Use wildcards in pattern but none in replacement to create a fixed output. - -PPL query:: - - os> source=accounts | eval test = 'prefix-value-suffix' | replace 'prefix-*-suffix' WITH 'MATCHED' IN test | fields test; - fetched rows / total rows = 4/4 - +---------+ - | test | - |---------| - | MATCHED | - | MATCHED | - | MATCHED | - | MATCHED | - +---------+ - -Example 13: Escaped asterisks with wildcards ---------------------------------------------- - -Combine escaped asterisks (literal) with wildcards for complex patterns. - -PPL query:: - - os> source=accounts | eval label = 'file123.txt' | replace 'file*.*' WITH '\**.*' IN label | fields label; - fetched rows / total rows = 4/4 - +----------+ - | label | - |----------| - | *123.txt | - | *123.txt | - | *123.txt | - | *123.txt | - +----------+ - - -Limitations -=========== -* Wildcards: ``*`` matches zero or more characters (case-sensitive) -* Replacement wildcards must match pattern wildcard count, or be zero -* Escape sequences: ``\*`` (literal asterisk), ``\\`` (literal backslash) \ No newline at end of file diff --git a/docs/user/ppl/cmd/reverse.md b/docs/user/ppl/cmd/reverse.md new file mode 100644 index 00000000000..f63a8f18e95 --- /dev/null +++ b/docs/user/ppl/cmd/reverse.md @@ -0,0 +1,134 @@ +# reverse + +## Description + +The `reverse` command reverses the display order of search results. The same results are returned, but in reverse order. +## Syntax + +reverse +* No parameters: The reverse command takes no arguments or options. + +## Note + +The `reverse` command processes the entire dataset. 
If applied directly to millions of records, it will consume significant memory resources on the coordinating node. Users should only apply the `reverse` command to smaller datasets, typically after aggregation operations. +## Example 1: Basic reverse operation + +This example shows reversing the order of all documents. + +```ppl +source=accounts +| fields account_number, age +| reverse +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | ++----------------+-----+ +``` + +## Example 2: Reverse with sort + +This example shows reversing results after sorting by age in ascending order, effectively giving descending order. + +```ppl +source=accounts +| sort age +| fields account_number, age +| reverse +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | ++----------------+-----+ +``` + +## Example 3: Reverse with head + +This example shows using reverse with head to get the last 2 records from the original order. + +```ppl +source=accounts +| reverse +| head 2 +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 6 | 36 | +| 18 | 33 | ++----------------+-----+ +``` + +## Example 4: Double reverse + +This example shows that applying reverse twice returns to the original order. + +```ppl +source=accounts +| reverse +| reverse +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 13 | 28 | +| 1 | 32 | +| 18 | 33 | +| 6 | 36 | ++----------------+-----+ +``` + +## Example 5: Reverse with complex pipeline + +This example shows reverse working with filtering and field selection. + +```ppl +source=accounts +| where age > 30 +| fields account_number, age +| reverse +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | ++----------------+-----+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/reverse.rst b/docs/user/ppl/cmd/reverse.rst deleted file mode 100644 index d839a687bf9..00000000000 --- a/docs/user/ppl/cmd/reverse.rst +++ /dev/null @@ -1,115 +0,0 @@ -======= -reverse -======= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``reverse`` command reverses the display order of search results. The same results are returned, but in reverse order. - -Syntax -====== -reverse - -* No parameters: The reverse command takes no arguments or options. - -Note -==== -| The `reverse` command processes the entire dataset. If applied directly to millions of records, it will consume significant memory resources on the coordinating node. Users should only apply the `reverse` command to smaller datasets, typically after aggregation operations. - -Example 1: Basic reverse operation -================================== - -This example shows reversing the order of all documents. 
-
-PPL query::
-
-    os> source=accounts | fields account_number, age | reverse;
-    fetched rows / total rows = 4/4
-    +----------------+-----+
-    | account_number | age |
-    |----------------+-----|
-    | 6              | 36  |
-    | 18             | 33  |
-    | 1              | 32  |
-    | 13             | 28  |
-    +----------------+-----+
-
-
-Example 2: Reverse with sort
-============================
-
-This example shows reversing results after sorting by age in ascending order, effectively giving descending order.
-
-PPL query::
-
-    os> source=accounts | sort age | fields account_number, age | reverse;
-    fetched rows / total rows = 4/4
-    +----------------+-----+
-    | account_number | age |
-    |----------------+-----|
-    | 6              | 36  |
-    | 18             | 33  |
-    | 1              | 32  |
-    | 13             | 28  |
-    +----------------+-----+
-
-
-Example 3: Reverse with head
-============================
-
-This example shows using reverse with head to get the last 2 records from the original order.
-
-PPL query::
-
-    os> source=accounts | reverse | head 2 | fields account_number, age;
-    fetched rows / total rows = 2/2
-    +----------------+-----+
-    | account_number | age |
-    |----------------+-----|
-    | 6              | 36  |
-    | 18             | 33  |
-    +----------------+-----+
-
-
-Example 4: Double reverse
-=========================
-
-This example shows that applying reverse twice returns to the original order.
-
-PPL query::
-
-    os> source=accounts | reverse | reverse | fields account_number, age;
-    fetched rows / total rows = 4/4
-    +----------------+-----+
-    | account_number | age |
-    |----------------+-----|
-    | 13             | 28  |
-    | 1              | 32  |
-    | 18             | 33  |
-    | 6              | 36  |
-    +----------------+-----+
-
-
-Example 5: Reverse with complex pipeline
-========================================
-
-This example shows reverse working with filtering and field selection.
-
-PPL query::
-
-    os> source=accounts | where age > 30 | fields account_number, age | reverse;
-    fetched rows / total rows = 3/3
-    +----------------+-----+
-    | account_number | age |
-    |----------------+-----|
-    | 6              | 36  |
-    | 18             | 33  |
-    | 1              | 32  |
-    +----------------+-----+
diff --git a/docs/user/ppl/cmd/rex.md b/docs/user/ppl/cmd/rex.md
new file mode 100644
index 00000000000..0f117373d8e
--- /dev/null
+++ b/docs/user/ppl/cmd/rex.md
@@ -0,0 +1,291 @@
+# rex
+
+## Description
+
+The `rex` command extracts fields from a raw text field using regular expression named capture groups.
+## Syntax
+
+rex [mode=\<mode\>] field=\<field\> \<pattern\> [max_match=\<int\>] [offset_field=\<string\>]
+* field: mandatory. The field must be a string field to extract data from.
+* pattern: mandatory string. The regular expression pattern with named capture groups used to extract new fields. Pattern must contain at least one named capture group using `(?<name>pattern)` syntax.
+* mode: optional. Either `extract` or `sed`. **Default:** extract
+  * **extract mode** (default): Creates new fields from regular expression named capture groups. This is the standard field extraction behavior.
+  * **sed mode**: Performs text substitution on the field using sed-style patterns
+    * `s/pattern/replacement/` - Replace first occurrence
+    * `s/pattern/replacement/g` - Replace all occurrences (global)
+    * `s/pattern/replacement/n` - Replace only the nth occurrence (where n is a number)
+    * `y/from_chars/to_chars/` - Character-by-character transliteration
+    * Backreferences: `\1`, `\2`, etc. reference captured groups in replacement
+* max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays. The value 0 means unlimited matches, but is automatically capped to the configured limit (default: 10, configurable via `plugins.ppl.rex.max_match.limit`).
+* offset_field: optional string. Field name to store the character offset positions of matches. Only available in extract mode.
+
+## Example 1: Basic Field Extraction
+
+This example shows extracting username and domain from email addresses using named capture groups. Both extracted fields are returned as string type.
+
+```ppl
+source=accounts
+| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)"
+| fields email, username, domain
+| head 2
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++-----------------------+------------+--------+
+| email                 | username   | domain |
+|-----------------------+------------+--------|
+| amberduke@pyrami.com  | amberduke  | pyrami |
+| hattiebond@netagy.com | hattiebond | netagy |
++-----------------------+------------+--------+
+```
+
+## Example 2: Handling Non-matching Patterns
+
+This example shows the rex command returning all events, setting extracted fields to null for non-matching patterns. Extracted fields would be string type when matches are found.
+
+```ppl
+source=accounts
+| rex field=email "(?<user>[^@]+)@(?<domain>gmail\\.com)"
+| fields email, user, domain
+| head 2
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++-----------------------+------+--------+
+| email                 | user | domain |
+|-----------------------+------+--------|
+| amberduke@pyrami.com  | null | null   |
+| hattiebond@netagy.com | null | null   |
++-----------------------+------+--------+
+```
+
+## Example 3: Multiple Matches with max_match
+
+This example shows extracting multiple words from the address field using the max_match parameter. The extracted field is returned as an array type containing string elements.
+
+```ppl
+source=accounts
+| rex field=address "(?<words>[A-Za-z]+)" max_match=2
+| fields address, words
+| head 3
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++--------------------+------------------+
+| address            | words            |
+|--------------------+------------------|
+| 880 Holmes Lane    | [Holmes,Lane]    |
+| 671 Bristol Street | [Bristol,Street] |
+| 789 Madison Street | [Madison,Street] |
++--------------------+------------------+
+```
+
+## Example 4: Text Replacement with mode=sed
+
+This example shows replacing email domains using sed mode for text substitution. The extracted field is returned as string type.
+
+```ppl
+source=accounts
+| rex field=email mode=sed "s/@.*/@company.com/"
+| fields email
+| head 2
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++------------------------+
+| email                  |
+|------------------------|
+| amberduke@company.com  |
+| hattiebond@company.com |
++------------------------+
+```
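+
+Sed mode also supports the global flag listed in the syntax section. The following sketch is illustrative (output not shown): with the `g` suffix, every `o` in the email value should be replaced with `0`, rather than only the first occurrence.
+
+```ppl
+source=accounts
+| rex field=email mode=sed "s/o/0/g"
+| fields email
+| head 2
+```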
+
+## Example 5: Using offset_field
+
+This example shows tracking the character positions where matches occur. Extracted fields are string type, and the offset_field is also string type.
+
+```ppl
+source=accounts
+| rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" offset_field=matchpos
+| fields email, username, domain, matchpos
+| head 2
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++-----------------------+------------+--------+---------------------------+
+| email                 | username   | domain | matchpos                  |
+|-----------------------+------------+--------+---------------------------|
+| amberduke@pyrami.com  | amberduke  | pyrami | domain=10-15&username=0-8 |
+| hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9 |
++-----------------------+------------+--------+---------------------------+
+```
+
+## Example 6: Complex Email Pattern
+
+This example shows extracting comprehensive email components including the top-level domain. All extracted fields are returned as string type.
+
+```ppl
+source=accounts
+| rex field=email "(?<user>[a-zA-Z0-9._%+-]+)@(?<domain>[a-zA-Z0-9.-]+)\\.(?<tld>[a-zA-Z]{2,})"
+| fields email, user, domain, tld
+| head 2
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++-----------------------+------------+--------+-----+
+| email                 | user       | domain | tld |
+|-----------------------+------------+--------+-----|
+| amberduke@pyrami.com  | amberduke  | pyrami | com |
+| hattiebond@netagy.com | hattiebond | netagy | com |
++-----------------------+------------+--------+-----+
+```
+
+## Example 7: Chaining Multiple rex Commands
+
+This example shows extracting initial letters from both first and last names. All extracted fields are returned as string type.
+
+```ppl
+source=accounts
+| rex field=firstname "(?<firstinitial>^.)"
+| rex field=lastname "(?<lastinitial>^.)"
+| fields firstname, lastname, firstinitial, lastinitial
+| head 3
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++-----------+----------+--------------+-------------+
+| firstname | lastname | firstinitial | lastinitial |
+|-----------+----------+--------------+-------------|
+| Amber     | Duke     | A            | D           |
+| Hattie    | Bond     | H            | B           |
+| Nanette   | Bates    | N            | B           |
++-----------+----------+--------------+-------------+
+```
+
+## Example 8: Named Capture Group Limitations
+
+This example demonstrates naming restrictions for capture groups. Group names cannot contain underscores due to Java regex limitations.
+Invalid PPL query with underscores
+
+```ppl
+source=accounts
+| rex field=email "(?<user_name>[^@]+)@(?<email_domain>[^.]+)"
+| fields email, user_name, email_domain
+```
+
+Expected output:
+
+```text
+{'reason': 'Invalid Query', 'details': "Invalid capture group name 'user_name'. Java regex group names must start with a letter and contain only letters and digits.", 'type': 'IllegalArgumentException'}
+Error: Query returned no data
+```
+
+Correct PPL query without underscores
+
+```ppl
+source=accounts
+| rex field=email "(?<username>[^@]+)@(?<emaildomain>[^.]+)"
+| fields email, username, emaildomain
+| head 2
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++-----------------------+------------+-------------+
+| email                 | username   | emaildomain |
+|-----------------------+------------+-------------|
+| amberduke@pyrami.com  | amberduke  | pyrami      |
+| hattiebond@netagy.com | hattiebond | netagy      |
++-----------------------+------------+-------------+
+```
+
+## Example 9: Max Match Limit Protection
+
+This example demonstrates the max_match limit protection mechanism. When max_match=0 (unlimited) is specified, the system automatically caps it to prevent memory exhaustion.
+PPL query with max_match=0 automatically capped to default limit of 10
+
+```ppl
+source=accounts
+| rex field=address "(?<digit>\\d*)" max_match=0
+| eval digit_count=array_length(digit)
+| fields address, digit_count
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------+-------------+
+| address         | digit_count |
+|-----------------+-------------|
+| 880 Holmes Lane | 10          |
++-----------------+-------------+
+```
+
+PPL query exceeding the configured limit results in an error
+
+```ppl
+source=accounts
+| rex field=address "(?<digit>\\d*)" max_match=100
+| fields address, digit
+| head 1
+```
+
+Expected output:
+
+```text
+{'reason': 'Invalid Query', 'details': 'Rex command max_match value (100) exceeds the configured limit (10). Consider using a smaller max_match value or adjust the plugins.ppl.rex.max_match.limit setting.', 'type': 'IllegalArgumentException'}
+Error: Query returned no data
+```
+
+## Comparison with Related Commands
+
+| Feature                           | rex        | parse      |
+| --------------------------------- | ---------- | ---------- |
+| Pattern Type                      | Java Regex | Java Regex |
+| Named Groups Required             | Yes        | Yes        |
+| Multiple Named Groups             | Yes        | No         |
+| Multiple Matches                  | Yes        | No         |
+| Text Substitution                 | Yes        | No         |
+| Offset Tracking                   | Yes        | No         |
+| Special Characters in Group Names | No         | No         |
+
+## Limitations
+
+**Named Capture Group Naming:**
+* Group names must start with a letter and contain only letters and digits
+* For detailed Java regex pattern syntax and usage, refer to the [official Java Pattern documentation](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html)
+
+**Pattern Requirements:**
+* Pattern must contain at least one named capture group
+* Regular capture groups `(...)` without names are not allowed
+
+**Max Match Limit:**
+* The `max_match` parameter is subject to a configurable system limit to prevent memory exhaustion
+* When `max_match=0` (unlimited) is specified, it is automatically capped at the configured limit (default: 10)
+* User-specified values exceeding the configured limit will result in an error
+* Users can adjust the limit via the `plugins.ppl.rex.max_match.limit` cluster setting. Setting this limit to a large value is not recommended as it can lead to excessive memory consumption, especially with patterns that match empty strings (e.g., `\d*`, `\w*`)
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/rex.rst b/docs/user/ppl/cmd/rex.rst
deleted file mode 100644
index 3dec7f26c42..00000000000
--- a/docs/user/ppl/cmd/rex.rst
+++ /dev/null
@@ -1,235 +0,0 @@
-===
-rex
-===
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``rex`` command extracts fields from a raw text field using regular expression named capture groups.
-
-Syntax
-======
-rex [mode=<mode>] field=<field> <pattern> [max_match=<int>] [offset_field=<string>]
-
-* field: mandatory. The field must be a string field to extract data from.
-* pattern: mandatory string. The regular expression pattern with named capture groups used to extract new fields. Pattern must contain at least one named capture group using ``(?<name>pattern)`` syntax.
-* mode: optional. Either ``extract`` or ``sed``. **Default:** extract
-
-  * **extract mode** (default): Creates new fields from regular expression named capture groups. This is the standard field extraction behavior.
-  * **sed mode**: Performs text substitution on the field using sed-style patterns
-
-    * ``s/pattern/replacement/`` - Replace first occurrence
-    * ``s/pattern/replacement/g`` - Replace all occurrences (global)
-    * ``s/pattern/replacement/n`` - Replace only the nth occurrence (where n is a number)
-    * ``y/from_chars/to_chars/`` - Character-by-character transliteration
-    * Backreferences: ``\1``, ``\2``, etc. reference captured groups in replacement
-
-* max_match: optional integer (default=1). Maximum number of matches to extract. If greater than 1, extracted fields become arrays. The value 0 means unlimited matches, but is automatically capped to the configured limit (default: 10, configurable via ``plugins.ppl.rex.max_match.limit``).
-* offset_field: optional string. Field name to store the character offset positions of matches. Only available in extract mode.
-
-Example 1: Basic Field Extraction
-=================================
-
-This example shows extracting username and domain from email addresses using named capture groups. Both extracted fields are returned as string type.
-
-PPL query::
-
-    os> source=accounts | rex field=email "(?<username>[^@]+)@(?<domain>[^.]+)" | fields email, username, domain | head 2 ;
-    fetched rows / total rows = 2/2
-    +-----------------------+------------+--------+
-    | email                 | username   | domain |
-    |-----------------------+------------+--------|
-    | amberduke@pyrami.com  | amberduke  | pyrami |
-    | hattiebond@netagy.com | hattiebond | netagy |
-    +-----------------------+------------+--------+
-
-
-Example 2: Handling Non-matching Patterns
-=========================================
-
-This example shows the rex command returning all events, setting extracted fields to null for non-matching patterns. Extracted fields would be string type when matches are found.
-
-PPL query::
-
-    os> source=accounts | rex field=email "(?<user>[^@]+)@(?<domain>gmail\\.com)" | fields email, user, domain | head 2 ;
-    fetched rows / total rows = 2/2
-    +-----------------------+------+--------+
-    | email                 | user | domain |
-    |-----------------------+------+--------|
-    | amberduke@pyrami.com  | null | null   |
-    | hattiebond@netagy.com | null | null   |
-    +-----------------------+------+--------+
-
-
-Example 3: Multiple Matches with max_match
-==========================================
-
-This example shows extracting multiple words from address field using max_match parameter. The extracted field is returned as an array type containing string elements.
-
-PPL query::
-
-    os> source=accounts | rex field=address "(?<words>[A-Za-z]+)" max_match=2 | fields address, words | head 3 ;
-    fetched rows / total rows = 3/3
-    +--------------------+------------------+
-    | address            | words            |
-    |--------------------+------------------|
-    | 880 Holmes Lane    | [Holmes,Lane]    |
-    | 671 Bristol Street | [Bristol,Street] |
-    | 789 Madison Street | [Madison,Street] |
-    +--------------------+------------------+
-
-
-Example 4: Text Replacement with mode=sed
-=========================================
-
-This example shows replacing email domains using sed mode for text substitution. The extracted field is returned as string type.
- -PPL query:: - - os> source=accounts | rex field=email mode=sed "s/@.*/@company.com/" | fields email | head 2 ; - fetched rows / total rows = 2/2 - +------------------------+ - | email | - |------------------------| - | amberduke@company.com | - | hattiebond@company.com | - +------------------------+ - - -Example 5: Using offset_field -============================= - -This example shows tracking the character positions where matches occur. Extracted fields are string type, and the offset_field is also string type. - -PPL query:: - - os> source=accounts | rex field=email "(?[^@]+)@(?[^.]+)" offset_field=matchpos | fields email, username, domain, matchpos | head 2 ; - fetched rows / total rows = 2/2 - +-----------------------+------------+--------+---------------------------+ - | email | username | domain | matchpos | - |-----------------------+------------+--------+---------------------------| - | amberduke@pyrami.com | amberduke | pyrami | domain=10-15&username=0-8 | - | hattiebond@netagy.com | hattiebond | netagy | domain=11-16&username=0-9 | - +-----------------------+------------+--------+---------------------------+ - - -Example 6: Complex Email Pattern -================================ - -This example shows extracting comprehensive email components including top-level domain. All extracted fields are returned as string type. - -PPL query:: - - os> source=accounts | rex field=email "(?[a-zA-Z0-9._%+-]+)@(?[a-zA-Z0-9.-]+)\\.(?[a-zA-Z]{2,})" | fields email, user, domain, tld | head 2 ; - fetched rows / total rows = 2/2 - +-----------------------+------------+--------+-----+ - | email | user | domain | tld | - |-----------------------+------------+--------+-----| - | amberduke@pyrami.com | amberduke | pyrami | com | - | hattiebond@netagy.com | hattiebond | netagy | com | - +-----------------------+------------+--------+-----+ - - -Example 7: Chaining Multiple rex Commands -========================================= - -This example shows extracting initial letters from both first and last names. All extracted fields are returned as string type. - -PPL query:: - - os> source=accounts | rex field=firstname "(?^.)" | rex field=lastname "(?^.)" | fields firstname, lastname, firstinitial, lastinitial | head 3 ; - fetched rows / total rows = 3/3 - +-----------+----------+--------------+-------------+ - | firstname | lastname | firstinitial | lastinitial | - |-----------+----------+--------------+-------------| - | Amber | Duke | A | D | - | Hattie | Bond | H | B | - | Nanette | Bates | N | B | - +-----------+----------+--------------+-------------+ - - -Example 8: Named Capture Group Limitations -========================================== - -This example demonstrates naming restrictions for capture groups. Group names cannot contain underscores due to Java regex limitations. - -Invalid PPL query with underscores:: - - os> source=accounts | rex field=email "(?[^@]+)@(?[^.]+)" | fields email, user_name, email_domain ; - {'reason': 'Invalid Query', 'details': "Invalid capture group name 'user_name'. 
Java regex group names must start with a letter and contain only letters and digits.", 'type': 'IllegalArgumentException'} - Error: Query returned no data - -Correct PPL query without underscores:: - - os> source=accounts | rex field=email "(?[^@]+)@(?[^.]+)" | fields email, username, emaildomain | head 2 ; - fetched rows / total rows = 2/2 - +-----------------------+------------+-------------+ - | email | username | emaildomain | - |-----------------------+------------+-------------| - | amberduke@pyrami.com | amberduke | pyrami | - | hattiebond@netagy.com | hattiebond | netagy | - +-----------------------+------------+-------------+ - - -Example 9: Max Match Limit Protection -===================================== - -This example demonstrates the max_match limit protection mechanism. When max_match=0 (unlimited) is specified, the system automatically caps it to prevent memory exhaustion. - -PPL query with max_match=0 automatically capped to default limit of 10:: - - os> source=accounts | rex field=address "(?\\d*)" max_match=0 | eval digit_count=array_length(digit) | fields address, digit_count | head 1 ; - fetched rows / total rows = 1/1 - +-----------------+-------------+ - | address | digit_count | - |-----------------+-------------| - | 880 Holmes Lane | 10 | - +-----------------+-------------+ - -PPL query exceeding the configured limit results in an error:: - - os> source=accounts | rex field=address "(?\\d*)" max_match=100 | fields address, digit | head 1 ; - {'reason': 'Invalid Query', 'details': 'Rex command max_match value (100) exceeds the configured limit (10). Consider using a smaller max_match value or adjust the plugins.ppl.rex.max_match.limit setting.', 'type': 'IllegalArgumentException'} - Error: Query returned no data - - -Comparison with Related Commands -================================ - -================================== ============ ============ -Feature rex parse -================================== ============ ============ -Pattern Type Java Regex Java Regex -Named Groups Required Yes Yes -Multiple Named Groups Yes No -Multiple Matches Yes No -Text Substitution Yes No -Offset Tracking Yes No -Special Characters in Group Names No No -================================== ============ ============ - - -Limitations -=========== -**Named Capture Group Naming:** - -* Group names must start with a letter and contain only letters and digits -* For detailed Java regex pattern syntax and usage, refer to the `official Java Pattern documentation `_ - -**Pattern Requirements:** - -* Pattern must contain at least one named capture group -* Regular capture groups ``(...)`` without names are not allowed - -**Max Match Limit:** - -* The ``max_match`` parameter is subject to a configurable system limit to prevent memory exhaustion -* When ``max_match=0`` (unlimited) is specified, it is automatically capped at the configured limit (default: 10) -* User-specified values exceeding the configured limit will result in an error -* Users can adjust the limit via the ``plugins.ppl.rex.max_match.limit`` cluster setting. 
Setting this limit to a large value is not recommended as it can lead to excessive memory consumption, especially with patterns that match empty strings (e.g., ``\d*``, ``\w*``)
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/search.md b/docs/user/ppl/cmd/search.md
new file mode 100644
index 00000000000..f05f47aa196
--- /dev/null
+++ b/docs/user/ppl/cmd/search.md
@@ -0,0 +1,745 @@
+# search
+
+## Description
+
+The `search` command retrieves documents from the index. The `search` command can only be used as the first command in the PPL query.
+
+## Syntax
+
+search source=[\<cluster\>:]\<index\> [search-expression]
+* search: the search keyword, which can be omitted.
+* index: mandatory. The search command must specify which index to query from. The index name can be prefixed by "\<cluster\>:" for cross-cluster search.
+* search-expression: optional. Search expression that gets converted to the OpenSearch [query_string](https://docs.opensearch.org/latest/query-dsl/full-text/query-string/) function, which uses [Lucene Query Syntax](https://lucene.apache.org/core/2_9_4/queryparsersyntax.html).
+
+## Search Expression
+
+The search expression syntax supports:
+* **Full text search**: `error` or `"error message"` - Searches the default field configured by the `index.query.default_field` setting (defaults to `*` which searches all fields)
+* **Field-value comparisons**: `field=value`, `field!=value`, `field>value`, `field>=value`, `field<value`, `field<=value`
+* **Boolean operators**: `AND`, `OR`, `NOT`, with grouping via parentheses
+* **Wildcards**: `*` (zero or more characters) and `?` (exactly one character)
+* **IN operator**: `field IN (value1, value2, ...)`
+* **Time modifiers**: `earliest=<time>` and `latest=<time>` to bound results on the implicit `@timestamp` field
+
+**Time Modifier Formats**:
+1. **Current time**: `now` or `now()` - the current time
+2. **Absolute format**: `MM/dd/yyyy:HH:mm:ss` or `yyyy-MM-dd HH:mm:ss`
+3. **Unix timestamp**: Numeric values (seconds since epoch) like `1754020060.123`
+4. **Relative format**: `(+|-)<time_integer><time_unit>[+<...>]@<snap_time_unit>` - Time offset from current time
+
+**Relative Time Components**:
+* **Time offset**: `+` (future) or `-` (past)
+* **Time amount**: Numeric value + time unit (`second`, `minute`, `hour`, `day`, `week`, `month`, `year`, and their variants)
+* **Snap to unit**: Optional `@<time_unit>` to round to the nearest unit (hour, day, month, etc.)
+
+**Examples of Time Modifier Values**:
+* `earliest=now` - From current time
+* `latest='2024-12-31 23:59:59'` - Until a specific date
+* `earliest=-7d` - From 7 days ago
+* `latest='+1d@d'` - Until tomorrow at start of day
+* `earliest='-1month@month'` - From start of previous month
+* `latest=1754020061` - Until a unix timestamp (August 1, 2025 03:47:41 UTC)
+
+Read more details on time modifiers in the [PPL relative_timestamp documentation](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/ppl-lang/functions/ppl-datetime.md#relative_timestamp).
+
+**Notes:**
+* **Column name conflicts**: If your data contains columns named "earliest" or "latest", use backticks to access them as regular fields (e.g., `` `earliest`="value"``) to avoid conflicts with time modifier syntax.
+* **Time snap syntax**: Time modifiers with chained time offsets must be wrapped in quotes (e.g., `latest='+1d@month-10h'`) for proper query parsing.
+
+## Default Field Configuration
+
+When you search without specifying a field, it searches the default field configured by the `index.query.default_field` index setting (defaults to `*` which searches all fields).
+You can check or modify the default field setting:
+
+    GET /accounts/_settings/index.query.default_field
+
+    PUT /accounts/_settings
+    {
+      "index.query.default_field": "firstname,lastname,email"
+    }
+
+## Field Types and Search Behavior
+
+**Text Fields**: Full-text search, phrase search
+* `search message="error occurred" source=logs`
+* Limitations: Wildcards apply to terms after analysis, not entire field value.
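+
+As a minimal sketch of this caveat (assuming a hypothetical `logs` index whose `message` field is text-mapped), a wildcard like the one below matches individual analyzed tokens such as `error` or `errors`, not the entire field value:
+
+```ppl
+search message=error* source=logs
+```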
+ +**Keyword Fields**: Exact matching, wildcard patterns +* `search status="ACTIVE" source=logs` +* Limitations: No text analysis, case-sensitive matching + +**Numeric Fields**: Range queries, exact matching, IN operator +* `search age>=18 AND balance<50000 source=accounts` +* Limitations: No wildcard or text search support + +**Date Fields**: Range queries, exact matching, IN operator +* `search timestamp>="2024-01-01" source=logs` +* Limitations: Must use index mapping date format, no wildcards + +**Boolean Fields**: true/false values only, exact matching, IN operator +* `search active=true source=users` +* Limitations: No wildcards or range queries + +**IP Fields**: Exact matching, CIDR notation +* `search client_ip="192.168.1.0/24" source=logs` +* Limitations: No wildcards for partial IP matching. For wildcard search use multi field with keyword: `search ip_address.keyword='1*' source=logs` or WHERE clause: `source=logs | where cast(ip_address as string) like '1%'` + +**Field Type Performance Tips**: + * Each field type has specific search capabilities and limitations. Using the wrong field type during ingestion impacts performance and accuracy + * For wildcard searches on non-keyword fields: Add a keyword field copy for better performance. Example: If you need wildcards on a text field, create `message.keyword` alongside `message` + +## Cross-Cluster Search + +Cross-cluster search lets any node in a cluster execute search requests against other clusters. Refer to [Cross-Cluster Search](../admin/cross_cluster_search.md) for configuration. +## Example 1: Text Search + +**Basic Text Search** (unquoted single term) + +```ppl +search ERROR source=otellogs +| sort @timestamp +| fields severityText, body +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+---------------------------------------------------------+ +| severityText | body | +|--------------+---------------------------------------------------------| +| ERROR | Payment failed: Insufficient funds for user@example.com | ++--------------+---------------------------------------------------------+ +``` + +**Phrase Search** (requires quotes for multi-word exact match) + +```ppl +search "Payment failed" source=otellogs +| fields body +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------------------------------------+ +| body | +|---------------------------------------------------------| +| Payment failed: Insufficient funds for user@example.com | ++---------------------------------------------------------+ +``` + +**Implicit AND with Multiple Terms** (unquoted literals are combined with AND) + +```ppl +search user email source=otellogs +| sort @timestamp +| fields body +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------------------------------------------------------------------------------------------+ +| body | +|--------------------------------------------------------------------------------------------------------------------| +| Executing SQL: SELECT * FROM users WHERE email LIKE '%@gmail.com' AND status != 'deleted' ORDER BY created_at DESC | ++--------------------------------------------------------------------------------------------------------------------+ +``` + +Note: `search user email` is equivalent to `search user AND email`. Multiple unquoted terms are automatically combined with AND. 
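+
+Written with an explicit `AND`, the following sketch is equivalent to the query above and returns the same result:
+
+```ppl
+search user AND email source=otellogs
+| sort @timestamp
+| fields body
+| head 1
+```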
+**Enclose in double quotes for terms which contain special characters** + +```ppl +search "john.doe+newsletter@company.com" source=otellogs +| fields body +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------------------------------------------------------------------------------------------+ +| body | +|--------------------------------------------------------------------------------------------------------------------| +| Email notification sent to john.doe+newsletter@company.com with subject: 'Welcome! Your order #12345 is confirmed' | ++--------------------------------------------------------------------------------------------------------------------+ +``` + +### Mixed Phrase and Boolean + +```ppl +search "User authentication" OR OAuth2 source=otellogs +| sort @timestamp +| fields body +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------------------------------------------------------------------------------------+ +| body | +|----------------------------------------------------------------------------------------------------------| +| [2024-01-15 10:30:09] production.INFO: User authentication successful for admin@company.org using OAuth2 | ++----------------------------------------------------------------------------------------------------------+ +``` + +## Example 2: Boolean Logic and Operator Precedence + +### Boolean Operators + +```ppl +search severityText="ERROR" OR severityText="FATAL" source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++--------------+ +| severityText | +|--------------| +| ERROR | +| FATAL | +| ERROR | ++--------------+ +``` + +```ppl +search severityText="INFO" AND `resource.attributes.service.name`="cart-service" source=otellogs +| fields body +| head 1; +``` + +Expected output + +```text +fetched rows / total rows = 1/1 ++----------------------------------------------------------------------------------+ +| body | +|----------------------------------------------------------------------------------| +| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | ++----------------------------------------------------------------------------------+ +``` + +**Operator Precedence** (highest to lowest): Parentheses → NOT → OR → AND + +```ppl +search severityText="ERROR" OR severityText="WARN" AND severityNumber>15 source=otellogs +| sort @timestamp +| fields severityText, severityNumber +| head 2 +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++--------------+----------------+ +| severityText | severityNumber | +|--------------+----------------| +| ERROR | 17 | +| ERROR | 17 | ++--------------+----------------+ +``` + +The above evaluates as `(severityText="ERROR" OR severityText="WARN") AND severityNumber>15` +## Example 3: NOT vs != Semantics + +**!= operator** (field must exist and not equal the value) + +```ppl +search employer!="Quility" source=accounts +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------+-----------+--------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +|----------------+-----------+--------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------| +| 1 | Amber | 
880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | ++----------------+-----------+--------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ +``` + +**NOT operator** (excludes matching conditions, includes null fields) + +```ppl +search NOT employer="Quility" source=accounts +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +|----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------| +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | ++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ +``` + +**Key difference**: `!=` excludes null values, `NOT` includes them. +Dale Adams (account 18) has `employer=null`. He appears in `NOT employer="Quility"` but not in `employer!="Quility"`. +## Example 4: Wildcards + +### Wildcard Patterns + +```ppl +search severityText=ERR* source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++--------------+ +| severityText | +|--------------| +| ERROR | +| ERROR | +| ERROR2 | ++--------------+ +``` + +```ppl +search body=user* source=otellogs +| sort @timestamp +| fields body +| head 2; +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------------------------------------------------------------------------+ +| body | +|----------------------------------------------------------------------------------| +| User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | +| Payment failed: Insufficient funds for user@example.com | ++----------------------------------------------------------------------------------+ +``` + +**Wildcard Rules**: +* `*` - Matches zero or more characters +* `?` - Matches exactly one character + +### Single character wildcard (?) + +```ppl +search severityText="INFO?" source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++--------------+ +| severityText | +|--------------| +| INFO2 | +| INFO3 | +| INFO4 | ++--------------+ +``` + +## Example 5: Range Queries + +Use comparison operators (>, <, >=, <=) to filter numeric and date fields within specific ranges. Range queries are particularly useful for filtering by age, price, timestamps, or any numeric metrics. 
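+
+Date fields accept the same comparison operators; a hedged sketch, assuming a hypothetical `logs` index with a date-mapped `timestamp` field (values must match the index mapping's date format):
+
+```ppl
+search timestamp>="2024-01-01" AND timestamp<"2024-02-01" source=logs
+```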
+ +```ppl +search severityNumber>15 AND severityNumber<=20 source=otellogs +| sort @timestamp +| fields severityNumber +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+ +| severityNumber | +|----------------| +| 17 | +| 17 | +| 18 | ++----------------+ +``` + +```ppl +search `attributes.payment.amount`>=1000.0 AND `attributes.payment.amount`<=2000.0 source=otellogs +| fields body; +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------------------------------------+ +| body | +|---------------------------------------------------------| +| Payment failed: Insufficient funds for user@example.com | ++---------------------------------------------------------+ +``` + +## Example 6: Field Search with Wildcards + +When searching in text or keyword fields, wildcards enable partial matching. This is particularly useful for finding records where you only know part of the value. Note that wildcards work best with keyword fields, while text fields may produce unexpected results due to tokenization. +**Partial Search in Keyword Fields** + +```ppl +search employer=Py* source=accounts +| fields firstname, employer +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+----------+ +| firstname | employer | +|-----------+----------| +| Amber | Pyrami | ++-----------+----------+ +``` + +### Combining Wildcards with Field Comparisons + +```ppl +search firstname=A* AND age>30 source=accounts +| fields firstname, age, city +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+-----+--------+ +| firstname | age | city | +|-----------+-----+--------| +| Amber | 32 | Brogan | ++-----------+-----+--------+ +``` + +**Important Notes on Wildcard Usage**: +* **Keyword fields**: Best for wildcard searches - exact value matching with pattern support +* **Text fields**: Wildcards apply to individual tokens after analysis, not the entire field value +* **Performance**: Leading wildcards (e.g., `*@example.com`) are slower than trailing wildcards +* **Case sensitivity**: Keyword field wildcards are case-sensitive unless normalized during indexing + +## Example 7: IN Operator and Field Comparisons + +The IN operator efficiently checks if a field matches any value from a list. This is cleaner and more performant than chaining multiple OR conditions for the same field. 
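+
+For contrast, the IN query shown next can also be written as chained `OR` conditions on the same field; this equivalent but more verbose sketch is what the IN operator replaces:
+
+```ppl
+search severityText="ERROR" OR severityText="WARN" OR severityText="FATAL" source=otellogs
+| sort @timestamp
+| fields severityText
+| head 3
+```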
+**IN Operator** + +```ppl +search severityText IN ("ERROR", "WARN", "FATAL") source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++--------------+ +| severityText | +|--------------| +| ERROR | +| WARN | +| FATAL | ++--------------+ +``` + +### Field Comparison Examples + +```ppl +search severityNumber=17 source=otellogs +| sort @timestamp +| fields body +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------------------------------------+ +| body | +|---------------------------------------------------------| +| Payment failed: Insufficient funds for user@example.com | ++---------------------------------------------------------+ +``` + +```ppl +search `attributes.user.email`="user@example.com" source=otellogs +| fields body; +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------------------------------------+ +| body | +|---------------------------------------------------------| +| Payment failed: Insufficient funds for user@example.com | ++---------------------------------------------------------+ +``` + +## Example 8: Complex Expressions + +Combine multiple conditions using boolean operators and parentheses to create sophisticated search queries. + +```ppl +search (severityText="ERROR" OR severityText="WARN") AND severityNumber>10 source=otellogs +| sort @timestamp +| fields severityText +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++--------------+ +| severityText | +|--------------| +| ERROR | +| WARN | +| ERROR | ++--------------+ +``` + +```ppl +search `attributes.user.email`="user@example.com" OR (`attributes.error.code`="INSUFFICIENT_FUNDS" AND severityNumber>15) source=otellogs +| fields body; +``` + +Expected output: + +``` +fetched rows / total rows = 1/1 ++---------------------------------------------------------+ +| body | +|---------------------------------------------------------| +| Payment failed: Insufficient funds for user@example.com | ++---------------------------------------------------------+ +``` + +## Example 9: Time Modifiers + +Time modifiers filter search results by time range using the implicit `@timestamp` field. They support various time formats for precise temporal filtering. 
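+
+A common pattern combines both bounds into a sliding window; a sketch relative to query time (results depend on when the query runs):
+
+```ppl
+search earliest=-1h latest=now source=otellogs
+| fields @timestamp, severityText
+```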
+**Absolute Time Filtering** + +```ppl +search earliest='2024-01-15 10:30:05' latest='2024-01-15 10:30:10' source=otellogs +| fields @timestamp, severityText +``` + +Expected output: + +```text +fetched rows / total rows = 6/6 ++-------------------------------+--------------+ +| @timestamp | severityText | +|-------------------------------+--------------| +| 2024-01-15 10:30:05.678901234 | FATAL | +| 2024-01-15 10:30:06.789012345 | TRACE | +| 2024-01-15 10:30:07.890123456 | ERROR | +| 2024-01-15 10:30:08.901234567 | WARN | +| 2024-01-15 10:30:09.012345678 | INFO | +| 2024-01-15 10:30:10.123456789 | TRACE2 | ++-------------------------------+--------------+ +``` + +**Relative Time Filtering** (before 30 seconds ago) + +```ppl +search latest=-30s source=otellogs +| sort @timestamp +| fields @timestamp, severityText +| head 3 +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-------------------------------+--------------+ +| @timestamp | severityText | +|-------------------------------+--------------| +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | +| 2024-01-15 10:30:02.345678901 | WARN | ++-------------------------------+--------------+ +``` + +**Time Snapping** (before start of current minute) + +```ppl +search latest='@m' source=otellogs +| fields @timestamp, severityText +| head 2 +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-------------------------------+--------------+ +| @timestamp | severityText | +|-------------------------------+--------------| +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | ++-------------------------------+--------------+ +``` + +### Unix Timestamp Filtering + +```ppl +search earliest=1705314600 latest=1705314605 source=otellogs +| fields @timestamp, severityText +``` + +Expected output: + +```text +fetched rows / total rows = 5/5 ++-------------------------------+--------------+ +| @timestamp | severityText | +|-------------------------------+--------------| +| 2024-01-15 10:30:00.123456789 | INFO | +| 2024-01-15 10:30:01.23456789 | ERROR | +| 2024-01-15 10:30:02.345678901 | WARN | +| 2024-01-15 10:30:03.456789012 | DEBUG | +| 2024-01-15 10:30:04.567890123 | INFO | ++-------------------------------+--------------+ +``` + +## Example 10: Special Characters and Escaping + +Understand when and how to escape special characters in your search queries. There are two categories of characters that need escaping: +**Characters that must be escaped**: +* **Backslashes (\)**: Always escape as `\\` to search for literal backslash +* **Quotes (")**: Escape as `\"` when inside quoted strings + +**Wildcard characters (escape only to search literally)**: +* **Asterisk (*)**: Use as-is for wildcard, escape as `\\*` to search for literal asterisk +* **Question mark (?)**: Use as-is for wildcard, escape as `\\?` to search for literal question mark + + +| Intent | PPL Syntax | Result | +|--------|------------|--------| +| Wildcard search | `field=user*` | Matches "user", "user123", "userABC" | +| Literal "user*" | `field="user\\*"` | Matches only "user*" | +| Wildcard search | `field=log?` | Matches "log1", "logA", "logs" | +| Literal "log?" | `field="log\\?"` | Matches only "log?" 
| + + +**Backslash in file paths** + +```ppl +search `attributes.error.type`="C:\\\\Users\\\\admin" source=otellogs +| fields `attributes.error.type` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------+ +| attributes.error.type | +|-----------------------| +| C:\Users\admin | ++-----------------------+ +``` + +Note: Each backslash in the search value needs to be escaped with another backslash. When using REST API with JSON, additional JSON escaping is required. +**Quotes within strings** + +```ppl +search body="\"exact phrase\"" source=otellogs +| sort @timestamp +| fields body +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------------------------------------------------------------------------------------------------------------------------------+ +| body | +|--------------------------------------------------------------------------------------------------------------------------------------------------------| +| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] | ++--------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +**Text with special characters** + +```ppl +search "wildcard\\* fuzzy~2" source=otellogs +| fields body +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------------------------------------------------------------------------------------------------------------------------------+ +| body | +|--------------------------------------------------------------------------------------------------------------------------------------------------------| +| Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] | ++--------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +## Example 11: Fetch All Data + +Retrieve all documents from an index by specifying only the source without any search conditions. This is useful for exploring small datasets or verifying data ingestion. 
+ +```ppl +source=accounts +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | +|----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------| +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | +| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | ++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/search.rst b/docs/user/ppl/cmd/search.rst deleted file mode 100644 index 31aa28cc46d..00000000000 --- a/docs/user/ppl/cmd/search.rst +++ /dev/null @@ -1,556 +0,0 @@ -====== -search -====== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``search`` command retrieves document from the index. The ``search`` command can only be used as the first command in the PPL query. - - -Syntax -====== -search source=[:] [search-expression] - -* search: search keyword, which could be ignored. -* index: mandatory. search command must specify which index to query from. The index name can be prefixed by ":" for cross-cluster search. -* search-expression: optional. Search expression that gets converted to OpenSearch `query_string `_ function which uses `Lucene Query Syntax `_. - -Search Expression -================= - -The search expression syntax supports: - -* **Full text search**: ``error`` or ``"error message"`` - Searches the default field configured by the ``index.query.default_field`` setting (defaults to ``*`` which searches all fields) -* **Field-value comparisons**: ``field=value``, ``field!=value``, ``field>value``, ``field>=value``, ``field`_: - -1. **Current time**: ``now`` or ``now()`` - the current time -2. **Absolute format**: ``MM/dd/yyyy:HH:mm:ss`` or ``yyyy-MM-dd HH:mm:ss`` -3. **Unix timestamp**: Numeric values (seconds since epoch) like ``1754020060.123`` -4. **Relative format**: ``(+|-)[+<...>]@`` - Time offset from current time - -**Relative Time Components**: - -* **Time offset**: ``+`` (future) or ``-`` (past) -* **Time amount**: Numeric value + time unit (``second``, ``minute``, ``hour``, ``day``, ``week``, ``month``, ``year``, and their variants) -* **Snap to unit**: Optional ``@`` to round to nearest unit (hour, day, month, etc.) - -**Examples of Time Modifier Values**: - -* ``earliest=now`` - From current time -* ``latest='2024-12-31 23:59:59'`` - Until a specific date -* ``earliest=-7d`` - From 7 days ago -* ``latest='+1d@d'`` - Until tomorrow at start of day -* ``earliest='-1month@month'`` - From start of previous month -* ``latest=1754020061`` - Until a unix timestamp (August 1, 2025 03:47:41 at UTC) - -Read more details on time modifiers `here `_. 
- -**Notes:** - -* **Column name conflicts**: If your data contains columns named "earliest" or "latest", use backticks to access them as regular fields (e.g., ```earliest`="value"``) to avoid conflicts with time modifier syntax. -* **Time snap syntax**: Time modifiers with chained time offsets must be wrapped in quotes (e.g., ``latest='+1d@month-10h'``) for proper query parsing. - -Default Field Configuration -=========================== -When you search without specifying a field, it searches the default field configured by the ``index.query.default_field`` index setting (defaults to ``*`` which searches all fields). - -You can check or modify the default field setting:: - - GET /accounts/_settings/index.query.default_field - - PUT /accounts/_settings - { - "index.query.default_field": "firstname,lastname,email" - } - -Field Types and Search Behavior -=============================== - -**Text Fields**: Full-text search, phrase search - -* ``search message="error occurred" source=logs`` - -* Limitations: Wildcards apply to terms after analysis, not entire field value. - -**Keyword Fields**: Exact matching, wildcard patterns - -* ``search status="ACTIVE" source=logs`` - -* Limitations: No text analysis, case-sensitive matching - -**Numeric Fields**: Range queries, exact matching, IN operator - -* ``search age>=18 AND balance<50000 source=accounts`` - -* Limitations: No wildcard or text search support - -**Date Fields**: Range queries, exact matching, IN operator - -* ``search timestamp>="2024-01-01" source=logs`` - -* Limitations: Must use index mapping date format, no wildcards - -**Boolean Fields**: true/false values only, exact matching, IN operator - -* ``search active=true source=users`` - -* Limitations: No wildcards or range queries - -**IP Fields**: Exact matching, CIDR notation - -* ``search client_ip="192.168.1.0/24" source=logs`` - -* Limitations: No wildcards for partial IP matching. For wildcard search use multi field with keyword: ``search ip_address.keyword='1*' source=logs`` or WHERE clause: ``source=logs | where cast(ip_address as string) like '1%'`` - -**Field Type Performance Tips**: - - * Each field type has specific search capabilities and limitations. Using the wrong field type during ingestion impacts performance and accuracy - * For wildcard searches on non-keyword fields: Add a keyword field copy for better performance. Example: If you need wildcards on a text field, create ``message.keyword`` alongside ``message`` - -Cross-Cluster Search -==================== -Cross-cluster search lets any node in a cluster execute search requests against other clusters. Refer to `Cross-Cluster Search `_ for configuration. 
- -Example 1: Text Search -====================== - -**Basic Text Search** (unquoted single term):: - - os> search ERROR source=otellogs | sort @timestamp | fields severityText, body | head 1; - fetched rows / total rows = 1/1 - +--------------+---------------------------------------------------------+ - | severityText | body | - |--------------+---------------------------------------------------------| - | ERROR | Payment failed: Insufficient funds for user@example.com | - +--------------+---------------------------------------------------------+ - -**Phrase Search** (requires quotes for multi-word exact match):: - - os> search "Payment failed" source=otellogs | fields body; - fetched rows / total rows = 1/1 - +---------------------------------------------------------+ - | body | - |---------------------------------------------------------| - | Payment failed: Insufficient funds for user@example.com | - +---------------------------------------------------------+ - -**Implicit AND with Multiple Terms** (unquoted literals are combined with AND):: - - os> search user email source=otellogs | sort @timestamp | fields body | head 1; - fetched rows / total rows = 1/1 - +--------------------------------------------------------------------------------------------------------------------+ - | body | - |--------------------------------------------------------------------------------------------------------------------| - | Executing SQL: SELECT * FROM users WHERE email LIKE '%@gmail.com' AND status != 'deleted' ORDER BY created_at DESC | - +--------------------------------------------------------------------------------------------------------------------+ - -Note: ``search user email`` is equivalent to ``search user AND email``. Multiple unquoted terms are automatically combined with AND. - -**Enclose in double quotes for terms which contain special characters**:: - - os> search "john.doe+newsletter@company.com" source=otellogs | fields body; - fetched rows / total rows = 1/1 - +--------------------------------------------------------------------------------------------------------------------+ - | body | - |--------------------------------------------------------------------------------------------------------------------| - | Email notification sent to john.doe+newsletter@company.com with subject: 'Welcome! 
Your order #12345 is confirmed' | - +--------------------------------------------------------------------------------------------------------------------+ - -**Mixed Phrase and Boolean**:: - - os> search "User authentication" OR OAuth2 source=otellogs | sort @timestamp | fields body | head 1; - fetched rows / total rows = 1/1 - +----------------------------------------------------------------------------------------------------------+ - | body | - |----------------------------------------------------------------------------------------------------------| - | [2024-01-15 10:30:09] production.INFO: User authentication successful for admin@company.org using OAuth2 | - +----------------------------------------------------------------------------------------------------------+ - -Example 2: Boolean Logic and Operator Precedence -================================================= - -**Boolean Operators**:: - - os> search severityText="ERROR" OR severityText="FATAL" source=otellogs | sort @timestamp | fields severityText | head 3; - fetched rows / total rows = 3/3 - +--------------+ - | severityText | - |--------------| - | ERROR | - | FATAL | - | ERROR | - +--------------+ - - os> search severityText="INFO" AND `resource.attributes.service.name`="cart-service" source=otellogs | fields body | head 1; - fetched rows / total rows = 1/1 - +----------------------------------------------------------------------------------+ - | body | - |----------------------------------------------------------------------------------| - | User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | - +----------------------------------------------------------------------------------+ - -**Operator Precedence** (highest to lowest): Parentheses → NOT → OR → AND:: - - os> search severityText="ERROR" OR severityText="WARN" AND severityNumber>15 source=otellogs | sort @timestamp | fields severityText, severityNumber | head 2; - fetched rows / total rows = 2/2 - +--------------+----------------+ - | severityText | severityNumber | - |--------------+----------------| - | ERROR | 17 | - | ERROR | 17 | - +--------------+----------------+ - -The above evaluates as ``(severityText="ERROR" OR severityText="WARN") AND severityNumber>15`` - -Example 3: NOT vs != Semantics -============================== - -**!= operator** (field must exist and not equal the value):: - - os> search employer!="Quility" source=accounts; - fetched rows / total rows = 2/2 - +----------------+-----------+--------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ - | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | - |----------------+-----------+--------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------| - | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | - | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | - +----------------+-----------+--------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ - -**NOT operator** (excludes matching conditions, includes null fields):: - - os> search NOT employer="Quility" source=accounts; - fetched rows / total rows = 3/3 - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ - | account_number | 
firstname | address | balance | gender | city | employer | state | age | email | lastname | - |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------| - | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | - | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | - | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ - -**Key difference**: ``!=`` excludes null values, ``NOT`` includes them. - -Dale Adams (account 18) has ``employer=null``. He appears in ``NOT employer="Quility"`` but not in ``employer!="Quility"``. - -Example 4: Wildcards -==================== - -**Wildcard Patterns**:: - - os> search severityText=ERR* source=otellogs | sort @timestamp | fields severityText | head 3; - fetched rows / total rows = 3/3 - +--------------+ - | severityText | - |--------------| - | ERROR | - | ERROR | - | ERROR2 | - +--------------+ - - os> search body=user* source=otellogs | sort @timestamp | fields body | head 2; - fetched rows / total rows = 2/2 - +----------------------------------------------------------------------------------+ - | body | - |----------------------------------------------------------------------------------| - | User e1ce63e6-8501-11f0-930d-c2fcbdc05f14 adding 4 of product HQTGWGPNH4 to cart | - | Payment failed: Insufficient funds for user@example.com | - +----------------------------------------------------------------------------------+ - -**Wildcard Rules**: - -* ``*`` - Matches zero or more characters -* ``?`` - Matches exactly one character - -**Single character wildcard (?)**:: - - os> search severityText="INFO?" source=otellogs | sort @timestamp | fields severityText | head 3; - fetched rows / total rows = 3/3 - +--------------+ - | severityText | - |--------------| - | INFO2 | - | INFO3 | - | INFO4 | - +--------------+ - - -Example 5: Range Queries -======================== - -Use comparison operators (>, <, >=, <=) to filter numeric and date fields within specific ranges. Range queries are particularly useful for filtering by age, price, timestamps, or any numeric metrics. - -:: - - os> search severityNumber>15 AND severityNumber<=20 source=otellogs | sort @timestamp | fields severityNumber | head 3; - fetched rows / total rows = 3/3 - +----------------+ - | severityNumber | - |----------------| - | 17 | - | 17 | - | 18 | - +----------------+ - - os> search `attributes.payment.amount`>=1000.0 AND `attributes.payment.amount`<=2000.0 source=otellogs | fields body; - fetched rows / total rows = 1/1 - +---------------------------------------------------------+ - | body | - |---------------------------------------------------------| - | Payment failed: Insufficient funds for user@example.com | - +---------------------------------------------------------+ - -Example 6: Field Search with Wildcards -====================================== - -When searching in text or keyword fields, wildcards enable partial matching. This is particularly useful for finding records where you only know part of the value. Note that wildcards work best with keyword fields, while text fields may produce unexpected results due to tokenization. 
- -**Partial Search in Keyword Fields**:: - - os> search employer=Py* source=accounts | fields firstname, employer; - fetched rows / total rows = 1/1 - +-----------+----------+ - | firstname | employer | - |-----------+----------| - | Amber | Pyrami | - +-----------+----------+ - -**Combining Wildcards with Field Comparisons**:: - - os> search firstname=A* AND age>30 source=accounts | fields firstname, age, city; - fetched rows / total rows = 1/1 - +-----------+-----+--------+ - | firstname | age | city | - |-----------+-----+--------| - | Amber | 32 | Brogan | - +-----------+-----+--------+ - -**Important Notes on Wildcard Usage**: - -* **Keyword fields**: Best for wildcard searches - exact value matching with pattern support -* **Text fields**: Wildcards apply to individual tokens after analysis, not the entire field value -* **Performance**: Leading wildcards (e.g., ``*@example.com``) are slower than trailing wildcards -* **Case sensitivity**: Keyword field wildcards are case-sensitive unless normalized during indexing - -Example 7: IN Operator and Field Comparisons -============================================ - -The IN operator efficiently checks if a field matches any value from a list. This is cleaner and more performant than chaining multiple OR conditions for the same field. - -**IN Operator**:: - - os> search severityText IN ("ERROR", "WARN", "FATAL") source=otellogs | sort @timestamp | fields severityText | head 3; - fetched rows / total rows = 3/3 - +--------------+ - | severityText | - |--------------| - | ERROR | - | WARN | - | FATAL | - +--------------+ - -**Field Comparison Examples**:: - - os> search severityNumber=17 source=otellogs | sort @timestamp | fields body | head 1; - fetched rows / total rows = 1/1 - +---------------------------------------------------------+ - | body | - |---------------------------------------------------------| - | Payment failed: Insufficient funds for user@example.com | - +---------------------------------------------------------+ - - os> search `attributes.user.email`="user@example.com" source=otellogs | fields body; - fetched rows / total rows = 1/1 - +---------------------------------------------------------+ - | body | - |---------------------------------------------------------| - | Payment failed: Insufficient funds for user@example.com | - +---------------------------------------------------------+ - -Example 8: Complex Expressions -============================== - -Combine multiple conditions using boolean operators and parentheses to create sophisticated search queries. - -:: - - os> search (severityText="ERROR" OR severityText="WARN") AND severityNumber>10 source=otellogs | sort @timestamp | fields severityText | head 3; - fetched rows / total rows = 3/3 - +--------------+ - | severityText | - |--------------| - | ERROR | - | WARN | - | ERROR | - +--------------+ - - os> search `attributes.user.email`="user@example.com" OR (`attributes.error.code`="INSUFFICIENT_FUNDS" AND severityNumber>15) source=otellogs | fields body; - fetched rows / total rows = 1/1 - +---------------------------------------------------------+ - | body | - |---------------------------------------------------------| - | Payment failed: Insufficient funds for user@example.com | - +---------------------------------------------------------+ - -Example 9: Time Modifiers -========================= - -Time modifiers filter search results by time range using the implicit ``@timestamp`` field. They support various time formats for precise temporal filtering. 
- -**Absolute Time Filtering**:: - - os> search earliest='2024-01-15 10:30:05' latest='2024-01-15 10:30:10' source=otellogs | fields @timestamp, severityText; - fetched rows / total rows = 6/6 - +-------------------------------+--------------+ - | @timestamp | severityText | - |-------------------------------+--------------| - | 2024-01-15 10:30:05.678901234 | FATAL | - | 2024-01-15 10:30:06.789012345 | TRACE | - | 2024-01-15 10:30:07.890123456 | ERROR | - | 2024-01-15 10:30:08.901234567 | WARN | - | 2024-01-15 10:30:09.012345678 | INFO | - | 2024-01-15 10:30:10.123456789 | TRACE2 | - +-------------------------------+--------------+ - -**Relative Time Filtering** (before 30 seconds ago):: - - os> search latest=-30s source=otellogs | sort @timestamp | fields @timestamp, severityText | head 3; - fetched rows / total rows = 3/3 - +-------------------------------+--------------+ - | @timestamp | severityText | - |-------------------------------+--------------| - | 2024-01-15 10:30:00.123456789 | INFO | - | 2024-01-15 10:30:01.23456789 | ERROR | - | 2024-01-15 10:30:02.345678901 | WARN | - +-------------------------------+--------------+ - -**Time Snapping** (before start of current minute):: - - os> search latest='@m' source=otellogs | fields @timestamp, severityText | head 2; - fetched rows / total rows = 2/2 - +-------------------------------+--------------+ - | @timestamp | severityText | - |-------------------------------+--------------| - | 2024-01-15 10:30:00.123456789 | INFO | - | 2024-01-15 10:30:01.23456789 | ERROR | - +-------------------------------+--------------+ - -**Unix Timestamp Filtering**:: - - os> search earliest=1705314600 latest=1705314605 source=otellogs | fields @timestamp, severityText; - fetched rows / total rows = 5/5 - +-------------------------------+--------------+ - | @timestamp | severityText | - |-------------------------------+--------------| - | 2024-01-15 10:30:00.123456789 | INFO | - | 2024-01-15 10:30:01.23456789 | ERROR | - | 2024-01-15 10:30:02.345678901 | WARN | - | 2024-01-15 10:30:03.456789012 | DEBUG | - | 2024-01-15 10:30:04.567890123 | INFO | - +-------------------------------+--------------+ - -Example 10: Special Characters and Escaping -=========================================== - -Understand when and how to escape special characters in your search queries. There are two categories of characters that need escaping: - -**Characters that must be escaped**: -* **Backslashes (\)**: Always escape as ``\\`` to search for literal backslash -* **Quotes (")**: Escape as ``\"`` when inside quoted strings - -**Wildcard characters (escape only to search literally)**: -* **Asterisk (*)**: Use as-is for wildcard, escape as ``\\*`` to search for literal asterisk -* **Question mark (?)**: Use as-is for wildcard, escape as ``\\?`` to search for literal question mark - -.. list-table:: Wildcard vs Literal Search - :widths: 25 35 40 - :header-rows: 1 - - * - Intent - - PPL Syntax - - Result - * - Wildcard search - - ``field=user*`` - - Matches "user", "user123", "userABC" - * - Literal "user*" - - ``field="user\\*"`` - - Matches only "user*" - * - Wildcard search - - ``field=log?`` - - Matches "log1", "logA", "logs" - * - Literal "log?" - - ``field="log\\?"`` - - Matches only "log?" 
- -**Backslash in file paths**:: - - os> search `attributes.error.type`="C:\\\\Users\\\\admin" source=otellogs | fields `attributes.error.type`; - fetched rows / total rows = 1/1 - +-----------------------+ - | attributes.error.type | - |-----------------------| - | C:\Users\admin | - +-----------------------+ - -Note: Each backslash in the search value needs to be escaped with another backslash. When using REST API with JSON, additional JSON escaping is required. - -**Quotes within strings**:: - - os> search body="\"exact phrase\"" source=otellogs | sort @timestamp | fields body | head 1; - fetched rows / total rows = 1/1 - +--------------------------------------------------------------------------------------------------------------------------------------------------------+ - | body | - |--------------------------------------------------------------------------------------------------------------------------------------------------------| - | Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] | - +--------------------------------------------------------------------------------------------------------------------------------------------------------+ - -**Text with special characters**:: - - os> search "wildcard\\* fuzzy~2" source=otellogs | fields body | head 1; - fetched rows / total rows = 1/1 - +--------------------------------------------------------------------------------------------------------------------------------------------------------+ - | body | - |--------------------------------------------------------------------------------------------------------------------------------------------------------| - | Query contains Lucene special characters: +field:value -excluded AND (grouped OR terms) NOT "exact phrase" wildcard* fuzzy~2 /regex/ [range TO search] | - +--------------------------------------------------------------------------------------------------------------------------------------------------------+ - -Example 11: Fetch All Data -========================== - -Retrieve all documents from an index by specifying only the source without any search conditions. This is useful for exploring small datasets or verifying data ingestion. 
- -:: - - os> source=accounts; - fetched rows / total rows = 4/4 - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ - | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | - |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------| - | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | - | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | - | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | - | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+ \ No newline at end of file diff --git a/docs/user/ppl/cmd/showdatasources.md b/docs/user/ppl/cmd/showdatasources.md new file mode 100644 index 00000000000..10129873aa6 --- /dev/null +++ b/docs/user/ppl/cmd/showdatasources.md @@ -0,0 +1,32 @@ +# show datasources + +## Description + +Use the `show datasources` command to query datasources configured in the PPL engine. The `show datasources` command can only be used as the first command in the PPL query. +## Syntax + +show datasources +## Example 1: Fetch all PROMETHEUS datasources + +This example shows fetching all the datasources of type prometheus. +PPL query for all PROMETHEUS DATASOURCES + +```ppl +show datasources +| where CONNECTOR_TYPE='PROMETHEUS' +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------+----------------+ +| DATASOURCE_NAME | CONNECTOR_TYPE | +|-----------------+----------------| +| my_prometheus | PROMETHEUS | ++-----------------+----------------+ +``` + +## Limitations + +The `show datasources` command can only work with `plugins.calcite.enabled=false`. \ No newline at end of file diff --git a/docs/user/ppl/cmd/showdatasources.rst b/docs/user/ppl/cmd/showdatasources.rst deleted file mode 100644 index 9d0794bb3aa..00000000000 --- a/docs/user/ppl/cmd/showdatasources.rst +++ /dev/null @@ -1,38 +0,0 @@ -================ -show datasources -================ - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| Use the ``show datasources`` command to query datasources configured in the PPL engine. The ``show datasources`` command can only be used as the first command in the PPL query. - -Syntax -====== -show datasources - -Example 1: Fetch all PROMETHEUS datasources -=========================================== - -This example shows fetching all the datasources of type prometheus. - -PPL query for all PROMETHEUS DATASOURCES:: - - os> show datasources | where CONNECTOR_TYPE='PROMETHEUS'; - fetched rows / total rows = 1/1 - +-----------------+----------------+ - | DATASOURCE_NAME | CONNECTOR_TYPE | - |-----------------+----------------| - | my_prometheus | PROMETHEUS | - +-----------------+----------------+ - - -Limitations -=========== -The ``show datasources`` command can only work with ``plugins.calcite.enabled=false``. 
diff --git a/docs/user/ppl/cmd/sort.md b/docs/user/ppl/cmd/sort.md new file mode 100644 index 00000000000..a6e5ba1c0ea --- /dev/null +++ b/docs/user/ppl/cmd/sort.md @@ -0,0 +1,256 @@ +# sort + +## Description + +The `sort` command sorts all the search results by the specified fields. +## Syntax + +sort [count] <[+\|-] sort-field \| sort-field [asc\|a\|desc\|d]>... +* count: optional. The number of results to return. Specifying a count of 0 or less than 0 returns all results. **Default:** 0. +* [+\|-]: optional. The plus [+] stands for ascending order and NULL/MISSING first, and the minus [-] stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. +* [asc\|a\|desc\|d]: optional. asc/a stands for ascending order and NULL/MISSING first. desc/d stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. +* sort-field: mandatory. The field used to sort. Can use `auto(field)`, `str(field)`, `ip(field)`, or `num(field)` to specify how to interpret field values. + +> **Note:** +> You cannot mix +/- and asc/desc in the same sort command. Choose one approach for all fields in a single sort command. + +## Example 1: Sort by one field + +This example shows sorting all documents by age field in ascending order. + +```ppl +source=accounts +| sort age +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 13 | 28 | +| 1 | 32 | +| 18 | 33 | +| 6 | 36 | ++----------------+-----+ +``` + +## Example 2: Sort by one field and return all results + +This example shows sorting all documents by age field in ascending order and returning all results. + +```ppl +source=accounts +| sort 0 age +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 13 | 28 | +| 1 | 32 | +| 18 | 33 | +| 6 | 36 | ++----------------+-----+ +``` + +## Example 3: Sort by one field in descending order (using -) + +This example shows sorting all documents by age field in descending order. + +```ppl +source=accounts +| sort - age +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | ++----------------+-----+ +``` + +## Example 4: Sort by one field in descending order (using desc) + +This example shows sorting all the documents by the age field in descending order using the desc keyword. + +```ppl +source=accounts +| sort age desc +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | ++----------------+-----+ +``` + +## Example 5: Sort by multiple fields (using +/-) + +This example shows sorting all documents by gender field in ascending order and age field in descending order using +/- operators.
+ +```ppl +source=accounts +| sort + gender, - age +| fields account_number, gender, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+--------+-----+ +| account_number | gender | age | +|----------------+--------+-----| +| 13 | F | 28 | +| 6 | M | 36 | +| 18 | M | 33 | +| 1 | M | 32 | ++----------------+--------+-----+ +``` + +## Example 6: Sort by multiple fields (using asc/desc) + +This example shows sorting all the documents by the gender field in ascending order and age field in descending order using asc/desc keywords. + +```ppl +source=accounts +| sort gender asc, age desc +| fields account_number, gender, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+--------+-----+ +| account_number | gender | age | +|----------------+--------+-----| +| 13 | F | 28 | +| 6 | M | 36 | +| 18 | M | 33 | +| 1 | M | 32 | ++----------------+--------+-----+ +``` + +## Example 7: Sort by a field including null values + +This example shows sorting the employer field with the default option (ascending order, null first). The result shows the null value in the first row. + +```ppl +source=accounts +| sort employer +| fields employer +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------+ +| employer | +|----------| +| null | +| Netagy | +| Pyrami | +| Quility | ++----------+ +``` + +## Example 8: Specify the number of sorted documents to return + +This example shows sorting all documents and returning 2 documents. + +```ppl +source=accounts +| sort 2 age +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 13 | 28 | +| 1 | 32 | ++----------------+-----+ +``` + +## Example 9: Sort with desc modifier + +This example shows sorting with the desc modifier to reverse sort order. + +```ppl +source=accounts +| sort age desc +| fields account_number, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----+ +| account_number | age | +|----------------+-----| +| 6 | 36 | +| 18 | 33 | +| 1 | 32 | +| 13 | 28 | ++----------------+-----+ +``` + +## Example 10: Sort by specifying the field type + +This example shows sorting with str() to sort numeric values lexicographically. + +```ppl +source=accounts +| sort str(account_number) +| fields account_number +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+ +| account_number | +|----------------| +| 1 | +| 13 | +| 18 | +| 6 | ++----------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/sort.rst b/docs/user/ppl/cmd/sort.rst deleted file mode 100644 index 929a2b313b4..00000000000 --- a/docs/user/ppl/cmd/sort.rst +++ /dev/null @@ -1,208 +0,0 @@ -==== -sort -==== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``sort`` command sorts all the search results by the specified fields. - -Syntax -============ -sort [count] <[+|-] sort-field | sort-field [asc|a|desc|d]>... - - -* count: optional. The number of results to return. Specifying a count of 0 or less than 0 returns all results. **Default:** 0. -* [+|-]: optional. The plus [+] stands for ascending order and NULL/MISSING first and a minus [-] stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. -* [asc|a|desc|d]: optional. asc/a stands for ascending order and NULL/MISSING first.
desc/d stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. -* sort-field: mandatory. The field used to sort. Can use ``auto(field)``, ``str(field)``, ``ip(field)``, or ``num(field)`` to specify how to interpret field values. - -.. note:: - You cannot mix +/- and asc/desc in the same sort command. Choose one approach for all fields in a single sort command. - - -Example 1: Sort by one field -============================ - -This example shows sorting all documents by age field in ascending order. - -PPL query:: - - os> source=accounts | sort age | fields account_number, age; - fetched rows / total rows = 4/4 - +----------------+-----+ - | account_number | age | - |----------------+-----| - | 13 | 28 | - | 1 | 32 | - | 18 | 33 | - | 6 | 36 | - +----------------+-----+ - - -Example 2: Sort by one field return all the result -================================================== - -This example shows sorting all documents by age field in ascending order and returning all results. - -PPL query:: - - os> source=accounts | sort 0 age | fields account_number, age; - fetched rows / total rows = 4/4 - +----------------+-----+ - | account_number | age | - |----------------+-----| - | 13 | 28 | - | 1 | 32 | - | 18 | 33 | - | 6 | 36 | - +----------------+-----+ - - -Example 3: Sort by one field in descending order (using -) -========================================================== - -This example shows sorting all documents by age field in descending order. - -PPL query:: - - os> source=accounts | sort - age | fields account_number, age; - fetched rows / total rows = 4/4 - +----------------+-----+ - | account_number | age | - |----------------+-----| - | 6 | 36 | - | 18 | 33 | - | 1 | 32 | - | 13 | 28 | - +----------------+-----+ - -Example 4: Sort by one field in descending order (using desc) -============================================================== - -This example shows sorting all the document by the age field in descending order using the desc keyword. - -PPL query:: - - os> source=accounts | sort age desc | fields account_number, age; - fetched rows / total rows = 4/4 - +----------------+-----+ - | account_number | age | - |----------------+-----| - | 6 | 36 | - | 18 | 33 | - | 1 | 32 | - | 13 | 28 | - +----------------+-----+ - -Example 5: Sort by multiple fields (using +/-) -============================================== - -This example shows sorting all documents by gender field in ascending order and age field in descending order using +/- operators. - -PPL query:: - - os> source=accounts | sort + gender, - age | fields account_number, gender, age; - fetched rows / total rows = 4/4 - +----------------+--------+-----+ - | account_number | gender | age | - |----------------+--------+-----| - | 13 | F | 28 | - | 6 | M | 36 | - | 18 | M | 33 | - | 1 | M | 32 | - +----------------+--------+-----+ - -Example 6: Sort by multiple fields (using asc/desc) -==================================================== - -This example shows sorting all the document by the gender field in ascending order and age field in descending order using asc/desc keywords. 
- -PPL query:: - - os> source=accounts | sort gender asc, age desc | fields account_number, gender, age; - fetched rows / total rows = 4/4 - +----------------+--------+-----+ - | account_number | gender | age | - |----------------+--------+-----| - | 13 | F | 28 | - | 6 | M | 36 | - | 18 | M | 33 | - | 1 | M | 32 | - +----------------+--------+-----+ - -Example 7: Sort by field include null value -=========================================== - -This example shows sorting employer field by default option (ascending order and null first). The result shows that null value is in the first row. - -PPL query:: - - os> source=accounts | sort employer | fields employer; - fetched rows / total rows = 4/4 - +----------+ - | employer | - |----------| - | null | - | Netagy | - | Pyrami | - | Quility | - +----------+ - -Example 8: Specify the number of sorted documents to return -============================================================ - -This example shows sorting all documents and returning 2 documents. - -PPL query:: - - os> source=accounts | sort 2 age | fields account_number, age; - fetched rows / total rows = 2/2 - +----------------+-----+ - | account_number | age | - |----------------+-----| - | 13 | 28 | - | 1 | 32 | - +----------------+-----+ - -Example 9: Sort with desc modifier -=================================== - -This example shows sorting with the desc modifier to reverse sort order. - -PPL query:: - - os> source=accounts | sort age desc | fields account_number, age; - fetched rows / total rows = 4/4 - +----------------+-----+ - | account_number | age | - |----------------+-----| - | 6 | 36 | - | 18 | 33 | - | 1 | 32 | - | 13 | 28 | - +----------------+-----+ - -Example 10: Sort with specifying field type ================================== - -This example shows sorting with str() to sort numeric values lexicographically. - -PPL query:: - - os> source=accounts | sort str(account_number) | fields account_number; - fetched rows / total rows = 4/4 - +----------------+ - | account_number | - |----------------| - | 1 | - | 13 | - | 18 | - | 6 | - +----------------+ \ No newline at end of file diff --git a/docs/user/ppl/cmd/spath.md b/docs/user/ppl/cmd/spath.md new file mode 100644 index 00000000000..c83afc3a31c --- /dev/null +++ b/docs/user/ppl/cmd/spath.md @@ -0,0 +1,110 @@ +# spath + +## Description + +The `spath` command allows extracting fields from structured text data. It currently allows selecting from JSON data with JSON paths. +## Syntax + +spath input=<field> [output=<field>] [path=]<path> +* input: mandatory. The field to scan for JSON data. +* output: optional. The destination field that the data will be loaded to. **Default:** value of `path`. +* path: mandatory. The path of the data to load for the object. For more information on path syntax, see [json_extract](../functions/json.md#json_extract). + +## Note + +The `spath` command currently does not support pushdown behavior for extraction. It will be slow on large datasets. It's generally better to index fields needed for filtering directly instead of using `spath` to filter nested fields. +## Example 1: Simple Field Extraction + +The simplest spath is to extract a single field. This example extracts `n` from the `doc_n` field of type `text`.
+ +```ppl +source=structured +| spath input=doc_n n +| fields doc_n n +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------+---+ +| doc_n | n | +|----------+---| +| {"n": 1} | 1 | +| {"n": 2} | 2 | +| {"n": 3} | 3 | ++----------+---+ +``` + +## Example 2: Lists & Nesting + +This example demonstrates more JSON path uses, like traversing nested fields and extracting list elements. + +```ppl +source=structured +| spath input=doc_list output=first_element list{0} +| spath input=doc_list output=all_elements list{} +| spath input=doc_list output=nested nest_out.nest_in +| fields doc_list first_element all_elements nested +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++------------------------------------------------------+---------------+--------------+--------+ +| doc_list | first_element | all_elements | nested | +|------------------------------------------------------+---------------+--------------+--------| +| {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a | +| {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a | +| {"list": [5, 6], "nest_out": {"nest_in": "a"}} | 5 | [5,6] | a | ++------------------------------------------------------+---------------+--------------+--------+ +``` + +## Example 3: Sum of inner elements + +This example shows extracting an inner field and doing statistics on it, using the docs from example 1. It also demonstrates that `spath` always returns strings for inner types. + +```ppl +source=structured +| spath input=doc_n n +| eval n=cast(n as int) +| stats sum(n) +| fields `sum(n)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| sum(n) | +|--------| +| 6 | ++--------+ +``` + +## Example 4: Escaped paths + +`spath` can escape paths with strings to accept any path that `json_extract` does. This includes escaping complex field names as array components. + +```ppl +source=structured +| spath output=a input=doc_escape "['a fancy field name']" +| spath output=b input=doc_escape "['a.b.c']" +| fields a b +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-------+---+ +| a | b | +|-------+---| +| true | 0 | +| true | 1 | +| false | 2 | ++-------+---+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/spath.rst b/docs/user/ppl/cmd/spath.rst deleted file mode 100644 index f7a9d034132..00000000000 --- a/docs/user/ppl/cmd/spath.rst +++ /dev/null @@ -1,92 +0,0 @@ -===== -spath -===== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The `spath` command allows extracting fields from structured text data. It currently allows selecting from JSON data with JSON paths. - -Syntax -====== -spath input= [output=] [path=] - -* input: mandatory. The field to scan for JSON data. -* output: optional. The destination field that the data will be loaded to. **Default:** value of `path`. -* path: mandatory. The path of the data to load for the object. For more information on path syntax, see `json_extract <../functions/json.rst#json_extract>`_. - -Note -===== -The `spath` command currently does not support pushdown behavior for extraction. It will be slow on large datasets. It's generally better to index fields needed for filtering directly instead of using `spath` to filter nested fields. - -Example 1: Simple Field Extraction -================================== - -The simplest spath is to extract a single field. This example extracts `n` from the `doc` field of type `text`. 
- -PPL query:: - - os> source=structured | spath input=doc_n n | fields doc_n n; - fetched rows / total rows = 3/3 - +----------+---+ - | doc_n | n | - |----------+---| - | {"n": 1} | 1 | - | {"n": 2} | 2 | - | {"n": 3} | 3 | - +----------+---+ - -Example 2: Lists & Nesting -========================== - -This example demonstrates more JSON path uses, like traversing nested fields and extracting list elements. - -PPL query:: - - os> source=structured | spath input=doc_list output=first_element list{0} | spath input=doc_list output=all_elements list{} | spath input=doc_list output=nested nest_out.nest_in | fields doc_list first_element all_elements nested; - fetched rows / total rows = 3/3 - +------------------------------------------------------+---------------+--------------+--------+ - | doc_list | first_element | all_elements | nested | - |------------------------------------------------------+---------------+--------------+--------| - | {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a | - | {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a | - | {"list": [5, 6], "nest_out": {"nest_in": "a"}} | 5 | [5,6] | a | - +------------------------------------------------------+---------------+--------------+--------+ - -Example 3: Sum of inner elements -================================ - -This example shows extracting an inner field and doing statistics on it, using the docs from example 1. It also demonstrates that `spath` always returns strings for inner types. - -PPL query:: - - os> source=structured | spath input=doc_n n | eval n=cast(n as int) | stats sum(n) | fields `sum(n)`; - fetched rows / total rows = 1/1 - +--------+ - | sum(n) | - |--------| - | 6 | - +--------+ - -Example 4: Escaped paths -============================ - -`spath` can escape paths with strings to accept any path that `json_extract` does. This includes escaping complex field names as array components. - -PPL query:: - - os> source=structured | spath output=a input=doc_escape "['a fancy field name']" | spath output=b input=doc_escape "['a.b.c']" | fields a b; - fetched rows / total rows = 3/3 - +-------+---+ - | a | b | - |-------+---| - | true | 0 | - | true | 1 | - | false | 2 | - +-------+---+ diff --git a/docs/user/ppl/cmd/stats.md b/docs/user/ppl/cmd/stats.md new file mode 100644 index 00000000000..5d805b6b723 --- /dev/null +++ b/docs/user/ppl/cmd/stats.md @@ -0,0 +1,487 @@ +# stats + +## Description + +The `stats` command calculates aggregations from the search results. +## Syntax + +stats [bucket_nullable=bool] <aggregation>... [by-clause] +* aggregation: mandatory. An aggregation function. +* bucket_nullable: optional. Controls whether the stats command includes null buckets in group-by aggregations. When set to `false`, the aggregation ignores records where the group-by field is null, resulting in faster performance by excluding the null bucket. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`. + * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true` + * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false` +* by-clause: optional. Groups results by specified fields or expressions. Syntax: by [span-expression,] [field,]... **Default:** If no by-clause is specified, the stats command returns only one row, which is the aggregation over the entire result set. +* span-expression: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr).
The unit of the interval expression is the natural unit by default. If the field is a date/time type field, the aggregation results always ignore the null bucket. For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets. + * Available time units + * millisecond (ms) + * second (s) + * minute (m, case sensitive) + * hour (h) + * day (d) + * week (w) + * month (M, case sensitive) + * quarter (q) + * year (y) + +## Aggregation Functions + +The stats command supports the following aggregation functions: +* COUNT/C: Count of values +* SUM: Sum of numeric values +* AVG: Average of numeric values +* MAX: Maximum value +* MIN: Minimum value +* VAR_SAMP: Sample variance +* VAR_POP: Population variance +* STDDEV_SAMP: Sample standard deviation +* STDDEV_POP: Population standard deviation +* DISTINCT_COUNT_APPROX: Approximate distinct count +* TAKE: List of original values +* PERCENTILE/PERCENTILE_APPROX: Percentile calculations +* PERC<percent>/P<percent>: Percentile shortcut functions +* MEDIAN: 50th percentile +* EARLIEST: Earliest value by timestamp +* LATEST: Latest value by timestamp +* FIRST: First non-null value +* LAST: Last non-null value +* LIST: Collect all values into array +* VALUES: Collect unique values into sorted array + +For detailed documentation of each function, see [Aggregation Functions](../functions/aggregations.md). +## Example 1: Calculate the count of events + +This example shows calculating the count of events in the accounts. + +```ppl +source=accounts +| stats count() +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| count() | +|---------| +| 4 | ++---------+ +``` + +## Example 2: Calculate the average of a field + +This example shows calculating the average age of all the accounts. + +```ppl +source=accounts +| stats avg(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------+ +| avg(age) | +|----------| +| 32.25 | ++----------+ +``` + +## Example 3: Calculate the average of a field by group + +This example shows calculating the average age of all the accounts, grouped by gender. + +```ppl +source=accounts +| stats avg(age) by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++--------------------+--------+ +| avg(age) | gender | +|--------------------+--------| +| 28.0 | F | +| 33.666666666666664 | M | ++--------------------+--------+ +``` + +## Example 4: Calculate the average, sum and count of a field by group + +This example shows calculating the average age, sum of age, and count of events for all the accounts, grouped by gender. + +```ppl +source=accounts +| stats avg(age), sum(age), count() by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++--------------------+----------+---------+--------+ +| avg(age) | sum(age) | count() | gender | +|--------------------+----------+---------+--------| +| 28.0 | 28 | 1 | F | +| 33.666666666666664 | 101 | 3 | M | ++--------------------+----------+---------+--------+ +``` + +## Example 5: Calculate the maximum of a field + +The example calculates the max age of all the accounts. + +```ppl +source=accounts +| stats max(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------+ +| max(age) | +|----------| +| 36 | ++----------+ +``` + +## Example 6: Calculate the maximum and minimum of a field by group + +The example calculates the max and min age values of all the accounts, grouped by gender.
+ +```ppl +source=accounts +| stats max(age), min(age) by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------+----------+--------+ +| max(age) | min(age) | gender | +|----------+----------+--------| +| 28 | 28 | F | +| 36 | 32 | M | ++----------+----------+--------+ +``` + +## Example 7: Calculate the distinct count of a field + +To get the count of distinct values of a field, you can use the `DISTINCT_COUNT` (or `DC`) function instead of `COUNT`. The example calculates both the count and the distinct count of the gender field for all the accounts. + +```ppl +source=accounts +| stats count(gender), distinct_count(gender) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------+------------------------+ +| count(gender) | distinct_count(gender) | +|---------------+------------------------| +| 4 | 2 | ++---------------+------------------------+ +``` + +## Example 8: Calculate the count by a span + +The example gets the count of age by intervals of 10 years. + +```ppl +source=accounts +| stats count(age) by span(age, 10) as age_span +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++------------+----------+ +| count(age) | age_span | +|------------+----------| +| 1 | 20 | +| 3 | 30 | ++------------+----------+ +``` + +## Example 9: Calculate the count by a gender and span + +The example gets the count of age by intervals of 5 years, grouped by gender. + +```ppl +source=accounts +| stats count() as cnt by span(age, 5) as age_span, gender +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----+----------+--------+ +| cnt | age_span | gender | +|-----+----------+--------| +| 1 | 25 | F | +| 2 | 30 | M | +| 1 | 35 | M | ++-----+----------+--------+ +``` + +Span will always be the first grouping key, regardless of the order you specify. + +```ppl +source=accounts +| stats count() as cnt by gender, span(age, 5) as age_span +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----+----------+--------+ +| cnt | age_span | gender | +|-----+----------+--------| +| 1 | 25 | F | +| 2 | 30 | M | +| 1 | 35 | M | ++-----+----------+--------+ +``` + +## Example 10: Calculate the count and get email list by a gender and span + +The example gets the count of age by intervals of 5 years, grouped by gender; additionally, each row gets a list of at most 5 emails. + +```ppl +source=accounts +| stats count() as cnt, take(email, 5) by span(age, 5) as age_span, gender +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----+--------------------------------------------+----------+--------+ +| cnt | take(email, 5) | age_span | gender | +|-----+--------------------------------------------+----------+--------| +| 1 | [] | 25 | F | +| 2 | [amberduke@pyrami.com,daleadams@boink.com] | 30 | M | +| 1 | [hattiebond@netagy.com] | 35 | M | ++-----+--------------------------------------------+----------+--------+ +``` + +## Example 11: Calculate the percentile of a field + +This example shows calculating the 90th percentile of age for all the accounts. + +```ppl +source=accounts +| stats percentile(age, 90) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------+ +| percentile(age, 90) | +|---------------------| +| 36 | ++---------------------+ +``` + +## Example 12: Calculate the percentile of a field by group + +This example shows calculating the 90th percentile of age for all the accounts, grouped by gender.
+ +```ppl +source=accounts +| stats percentile(age, 90) by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++---------------------+--------+ +| percentile(age, 90) | gender | +|---------------------+--------| +| 28 | F | +| 36 | M | ++---------------------+--------+ +``` + +## Example 13: Calculate the percentile by a gender and span + +The example gets the 90th percentile of age by intervals of 10 years, grouped by gender. + +```ppl +source=accounts +| stats percentile(age, 90) as p90 by span(age, 10) as age_span, gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----+----------+--------+ +| p90 | age_span | gender | +|-----+----------+--------| +| 28 | 20 | F | +| 36 | 30 | M | ++-----+----------+--------+ +``` + +## Example 14: Collect all values in a field using LIST + +The example shows how to collect all firstname values, preserving duplicates and order. + +```ppl +source=accounts +| stats list(firstname) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------------+ +| list(firstname) | +|-----------------------------| +| [Amber,Hattie,Nanette,Dale] | ++-----------------------------+ +``` + +## Example 15: Ignore null bucket + +This example uses `bucket_nullable=false` to exclude the record whose group-by field (email) is null from the aggregation. + +```ppl +source=accounts +| stats bucket_nullable=false count() as cnt by email +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----+-----------------------+ +| cnt | email | +|-----+-----------------------| +| 1 | amberduke@pyrami.com | +| 1 | daleadams@boink.com | +| 1 | hattiebond@netagy.com | ++-----+-----------------------+ +``` + +## Example 16: Collect unique values in a field using VALUES + +The example shows how to collect all unique firstname values, sorted lexicographically with duplicates removed.
+ +```ppl +source=accounts +| stats values(firstname) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------------+ +| values(firstname) | +|-----------------------------| +| [Amber,Dale,Hattie,Nanette] | ++-----------------------------+ +``` + +## Example 17: Span on date/time field always ignores the null bucket + +Index example data: + +| Name | DEPTNO | birthday | +|-------|--------|------------| +| Alice | 1 | 2024-04-21 | +| Bob | 2 | 2025-08-21 | +| Jeff | null | 2025-04-22 | +| Adam | 2 | null | + +```ppl ignore +source=example +| stats count() as cnt by span(birthday, 1y) as year +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----+------------+ +| cnt | year | +|-----+------------| +| 1 | 2024-01-01 | +| 2 | 2025-01-01 | ++-----+------------+ +``` + +```ppl ignore +source=example +| stats count() as cnt by span(birthday, 1y) as year, DEPTNO +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----+------------+--------+ +| cnt | year | DEPTNO | +|-----+------------+--------| +| 1 | 2024-01-01 | 1 | +| 1 | 2025-01-01 | 2 | +| 1 | 2025-01-01 | null | ++-----+------------+--------+ +``` + +```ppl ignore +source=example +| stats bucket_nullable=false count() as cnt by span(birthday, 1y) as year, DEPTNO +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----+------------+--------+ +| cnt | year | DEPTNO | +|-----+------------+--------| +| 1 | 2024-01-01 | 1 | +| 1 | 2025-01-01 | 2 | ++-----+------------+--------+ +``` + +## Example 18: Calculate the count by the implicit @timestamp field + +This example demonstrates that if you omit the field parameter in the span function, it will automatically use the implicit `@timestamp` field. + +```ppl ignore +source=big5 +| stats count() by span(1month) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+---------------------+ +| count() | span(1month) | +|---------+---------------------| +| 1 | 2023-01-01 00:00:00 | ++---------+---------------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/stats.rst b/docs/user/ppl/cmd/stats.rst deleted file mode 100644 index cae65c84c79..00000000000 --- a/docs/user/ppl/cmd/stats.rst +++ /dev/null @@ -1,409 +0,0 @@ -===== -stats -===== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``stats`` command calculates the aggregation from the search result. - - - -Syntax -====== -stats [bucket_nullable=bool] ... [by-clause] - -* aggregation: mandatory. An aggregation function. -* bucket_nullable: optional. Controls whether the stats command includes null buckets in group-by aggregations. When set to ``false``, the aggregation ignores records where the group-by field is null, resulting in faster performance by excluding null bucket. **Default:** Determined by ``plugins.ppl.syntax.legacy.preferred``. - - * When ``plugins.ppl.syntax.legacy.preferred=true``, ``bucket_nullable`` defaults to ``true`` - * When ``plugins.ppl.syntax.legacy.preferred=false``, ``bucket_nullable`` defaults to ``false`` - -* by-clause: optional. Groups results by specified fields or expressions. Syntax: by [span-expression,] [field,]... **Default:** If no by-clause is specified, the stats command returns only one row, which is the aggregation over the entire result set.
-* span-expression: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr). The unit of the interval expression is the natural unit by default. If the field is a date/time type field, the aggregation results always ignore null bucket. For example, ``span(age, 10)`` creates 10-year age buckets, ``span(timestamp, 1h)`` creates hourly buckets. - - * Available time units - - * millisecond (ms) - * second (s) - * minute (m, case sensitive) - * hour (h) - * day (d) - * week (w) - * month (M, case sensitive) - * quarter (q) - * year (y) - -Aggregation Functions -===================== - -The stats command supports the following aggregation functions: - -* COUNT/C: Count of values -* SUM: Sum of numeric values -* AVG: Average of numeric values -* MAX: Maximum value -* MIN: Minimum value -* VAR_SAMP: Sample variance -* VAR_POP: Population variance -* STDDEV_SAMP: Sample standard deviation -* STDDEV_POP: Population standard deviation -* DISTINCT_COUNT_APPROX: Approximate distinct count -* TAKE: List of original values -* PERCENTILE/PERCENTILE_APPROX: Percentile calculations -* PERC/P: Percentile shortcut functions -* MEDIAN: 50th percentile -* EARLIEST: Earliest value by timestamp -* LATEST: Latest value by timestamp -* FIRST: First non-null value -* LAST: Last non-null value -* LIST: Collect all values into array -* VALUES: Collect unique values into sorted array - -For detailed documentation of each function, see `Aggregation Functions <../functions/aggregations.rst>`_. - -Example 1: Calculate the count of events -======================================== - -This example shows calculating the count of events in the accounts. - -PPL query:: - - os> source=accounts | stats count(); - fetched rows / total rows = 1/1 - +---------+ - | count() | - |---------| - | 4 | - +---------+ - - -Example 2: Calculate the average of a field -=========================================== - -This example shows calculating the average age of all the accounts. - -PPL query:: - - os> source=accounts | stats avg(age); - fetched rows / total rows = 1/1 - +----------+ - | avg(age) | - |----------| - | 32.25 | - +----------+ - - -Example 3: Calculate the average of a field by group -==================================================== - -This example shows calculating the average age of all the accounts group by gender. - -PPL query:: - - os> source=accounts | stats avg(age) by gender; - fetched rows / total rows = 2/2 - +--------------------+--------+ - | avg(age) | gender | - |--------------------+--------| - | 28.0 | F | - | 33.666666666666664 | M | - +--------------------+--------+ - - -Example 4: Calculate the average, sum and count of a field by group -=================================================================== - -This example shows calculating the average age, sum age and count of events of all the accounts group by gender. - -PPL query:: - - os> source=accounts | stats avg(age), sum(age), count() by gender; - fetched rows / total rows = 2/2 - +--------------------+----------+---------+--------+ - | avg(age) | sum(age) | count() | gender | - |--------------------+----------+---------+--------| - | 28.0 | 28 | 1 | F | - | 33.666666666666664 | 101 | 3 | M | - +--------------------+----------+---------+--------+ - -Example 5: Calculate the maximum of a field -=========================================== - -The example calculates the max age of all the accounts. 
- -PPL query:: - - os> source=accounts | stats max(age); - fetched rows / total rows = 1/1 - +----------+ - | max(age) | - |----------| - | 36 | - +----------+ - -Example 6: Calculate the maximum and minimum of a field by group -================================================================ - -The example calculates the max and min age values of all the accounts group by gender. - -PPL query:: - - os> source=accounts | stats max(age), min(age) by gender; - fetched rows / total rows = 2/2 - +----------+----------+--------+ - | max(age) | min(age) | gender | - |----------+----------+--------| - | 28 | 28 | F | - | 36 | 32 | M | - +----------+----------+--------+ - -Example 7: Calculate the distinct count of a field -================================================== - -To get the count of distinct values of a field, you can use ``DISTINCT_COUNT`` (or ``DC``) function instead of ``COUNT``. The example calculates both the count and the distinct count of gender field of all the accounts. - -PPL query:: - - os> source=accounts | stats count(gender), distinct_count(gender); - fetched rows / total rows = 1/1 - +---------------+------------------------+ - | count(gender) | distinct_count(gender) | - |---------------+------------------------| - | 4 | 2 | - +---------------+------------------------+ - -Example 8: Calculate the count by a span -======================================== - -The example gets the count of age by the interval of 10 years. - -PPL query:: - - os> source=accounts | stats count(age) by span(age, 10) as age_span - fetched rows / total rows = 2/2 - +------------+----------+ - | count(age) | age_span | - |------------+----------| - | 1 | 20 | - | 3 | 30 | - +------------+----------+ - -Example 9: Calculate the count by a gender and span -=================================================== - -The example gets the count of age by the interval of 10 years and group by gender. - -PPL query:: - - os> source=accounts | stats count() as cnt by span(age, 5) as age_span, gender - fetched rows / total rows = 3/3 - +-----+----------+--------+ - | cnt | age_span | gender | - |-----+----------+--------| - | 1 | 25 | F | - | 2 | 30 | M | - | 1 | 35 | M | - +-----+----------+--------+ - -Span will always be the first grouping key whatever order you specify. - -PPL query:: - - os> source=accounts | stats count() as cnt by gender, span(age, 5) as age_span - fetched rows / total rows = 3/3 - +-----+----------+--------+ - | cnt | age_span | gender | - |-----+----------+--------| - | 1 | 25 | F | - | 2 | 30 | M | - | 1 | 35 | M | - +-----+----------+--------+ - -Example 10: Calculate the count and get email list by a gender and span -======================================================================= - -The example gets the count of age by the interval of 10 years and group by gender, additionally for each row get a list of at most 5 emails. 
- -PPL query:: - - os> source=accounts | stats count() as cnt, take(email, 5) by span(age, 5) as age_span, gender - fetched rows / total rows = 3/3 - +-----+--------------------------------------------+----------+--------+ - | cnt | take(email, 5) | age_span | gender | - |-----+--------------------------------------------+----------+--------| - | 1 | [] | 25 | F | - | 2 | [amberduke@pyrami.com,daleadams@boink.com] | 30 | M | - | 1 | [hattiebond@netagy.com] | 35 | M | - +-----+--------------------------------------------+----------+--------+ - -Example 11: Calculate the percentile of a field -=============================================== - -This example shows calculating the percentile 90th age of all the accounts. - -PPL query:: - - os> source=accounts | stats percentile(age, 90); - fetched rows / total rows = 1/1 - +---------------------+ - | percentile(age, 90) | - |---------------------| - | 36 | - +---------------------+ - - -Example 12: Calculate the percentile of a field by group -======================================================== - -This example shows calculating the percentile 90th age of all the accounts group by gender. - -PPL query:: - - os> source=accounts | stats percentile(age, 90) by gender; - fetched rows / total rows = 2/2 - +---------------------+--------+ - | percentile(age, 90) | gender | - |---------------------+--------| - | 28 | F | - | 36 | M | - +---------------------+--------+ - -Example 13: Calculate the percentile by a gender and span -========================================================= - -The example gets the percentile 90th age by the interval of 10 years and group by gender. - -PPL query:: - - os> source=accounts | stats percentile(age, 90) as p90 by span(age, 10) as age_span, gender - fetched rows / total rows = 2/2 - +-----+----------+--------+ - | p90 | age_span | gender | - |-----+----------+--------| - | 28 | 20 | F | - | 36 | 30 | M | - +-----+----------+--------+ - -Example 14: Collect all values in a field using LIST -==================================================== - -The example shows how to collect all firstname values, preserving duplicates and order. - -PPL query:: - - PPL> source=accounts | stats list(firstname); - fetched rows / total rows = 1/1 - +-------------------------------------+ - | list(firstname) | - |-------------------------------------| - | ["Amber","Hattie","Nanette","Dale"] | - +-------------------------------------+ - - -Example 15: Ignore null bucket -============================== - - -PPL query:: - - PPL> source=accounts | stats bucket_nullable=false count() as cnt by email; - fetched rows / total rows = 3/3 - +-----+-----------------------+ - | cnt | email | - |-----+-----------------------| - | 1 | amberduke@pyrami.com | - | 1 | daleadams@boink.com | - | 1 | hattiebond@netagy.com | - +-----+-----------------------+ - -Example 16: Collect unique values in a field using VALUES -========================================================== - -The example shows how to collect all unique firstname values, sorted lexicographically with duplicates removed. 
- -PPL query:: - - PPL> source=accounts | stats values(firstname); - fetched rows / total rows = 1/1 - +-------------------------------------+ - | values(firstname) | - |-------------------------------------| - | ["Amber","Dale","Hattie","Nanette"] | - +-------------------------------------+ - - -Example 17: Span on date/time field always ignore null bucket -============================================================= - -Index example data: - -+-------+--------+------------+ -| Name | DEPTNO | birthday | -+=======+========+============+ -| Alice | 1 | 2024-04-21 | -+-------+--------+------------+ -| Bob | 2 | 2025-08-21 | -+-------+--------+------------+ -| Jeff | null | 2025-04-22 | -+-------+--------+------------+ -| Adam | 2 | null | -+-------+--------+------------+ - -PPL query:: - - PPL> source=example | stats count() as cnt by span(birthday, 1y) as year; - fetched rows / total rows = 3/3 - +-----+------------+ - | cnt | year | - |-----+------------| - | 1 | 2024-01-01 | - | 2 | 2025-01-01 | - +-----+------------+ - - -PPL query:: - - PPL> source=example | stats count() as cnt by span(birthday, 1y) as year, DEPTNO; - fetched rows / total rows = 3/3 - +-----+------------+--------+ - | cnt | year | DEPTNO | - |-----+------------+--------| - | 1 | 2024-01-01 | 1 | - | 1 | 2025-01-01 | 2 | - | 1 | 2025-01-01 | null | - +-----+------------+--------+ - - -PPL query:: - - PPL> source=example | stats bucket_nullable=false count() as cnt by span(birthday, 1y) as year, DEPTNO; - fetched rows / total rows = 3/3 - +-----+------------+--------+ - | cnt | year | DEPTNO | - |-----+------------+--------| - | 1 | 2024-01-01 | 1 | - | 1 | 2025-01-01 | 2 | - +-----+------------+--------+ - - -Example 18: Calculate the count by the implicit @timestamp field -================================================================ - -This example demonstrates that if you omit the field parameter in the span function, it will automatically use the implicit ``@timestamp`` field. - -PPL query:: - - PPL> source=big5 | stats count() by span(1month) - fetched rows / total rows = 1/1 - +---------+---------------------+ - | count() | span(1month) | - |---------+---------------------| - | 1 | 2023-01-01 00:00:00 | - +---------+---------------------+ diff --git a/docs/user/ppl/cmd/streamstats.md b/docs/user/ppl/cmd/streamstats.md new file mode 100644 index 00000000000..c7f79b21339 --- /dev/null +++ b/docs/user/ppl/cmd/streamstats.md @@ -0,0 +1,281 @@ +# streamstats + +## Description + +The `streamstats` command is used to calculate cumulative or rolling statistics as events are processed in order. Unlike `stats` or `eventstats`, which operate on the entire dataset at once, it computes values incrementally on a per-event basis, often respecting the order of events in the search results. It allows you to generate running totals, moving averages, and other statistics that evolve with the stream of events. +Key aspects of `streamstats`: +1. It computes statistics incrementally as each event is processed, making it suitable for time-series and sequence-based analysis. +2. Supports arguments such as window (for sliding window calculations) and current (to control whether the current event is included in the calculation). +3. Retains all original events and appends new fields containing the calculated statistics. +4. Particularly useful for calculating running totals, identifying trends, or detecting changes over sequences of events.
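+ +For instance, a minimal sketch of the idea: the query below appends a running total and a running average to every event as it streams through (the `events` index and its numeric `bytes` field are hypothetical names used for illustration). + +```ppl ignore +source=events +| streamstats sum(bytes) as running_total, avg(bytes) as running_avg +| fields bytes, running_total, running_avg +```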
+ +Difference between `stats`, `eventstats` and `streamstats` +All of these commands can be used to generate aggregations such as average, sum, and maximum, but they have some key differences in how they operate and what they produce: +* Transformation Behavior + * `stats`: Transforms all events into an aggregated result table, losing original event structure. + * `eventstats`: Adds aggregation results as new fields to the original events without removing the event structure. + * `streamstats`: Adds cumulative (running) aggregation results to each event as they stream through the pipeline. +* Output Format + * `stats`: Output contains only aggregated values. Original raw events are not preserved. + * `eventstats`: Original events remain, with extra fields containing summary statistics. + * `streamstats`: Original events remain, with extra fields containing running totals or cumulative statistics. +* Aggregation Scope + * `stats`: Based on all events in the search (or groups defined by BY clause). + * `eventstats`: Based on all relevant events, then the result is added back to each event in the group. + * `streamstats`: Calculations occur progressively as each event is processed; can be scoped by window. +* Use Cases + * `stats`: When only aggregated results are needed (e.g., counts, averages, sums). + * `eventstats`: When aggregated statistics are needed alongside original event data. + * `streamstats`: When a running total or cumulative statistic is needed across event streams. + +## Syntax + +streamstats [bucket_nullable=bool] [current=<boolean>] [window=<int>] [global=<boolean>] [reset_before="("<eval-expression>")"] [reset_after="("<eval-expression>")"] <function>... [by-clause] +* function: mandatory. An aggregation function or window function. +* bucket_nullable: optional. Controls whether the streamstats command considers null buckets a valid group in group-by aggregations. When set to `false`, it will not treat null group-by values as a distinct group during aggregation. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`. + * When `plugins.ppl.syntax.legacy.preferred=true`, `bucket_nullable` defaults to `true` + * When `plugins.ppl.syntax.legacy.preferred=false`, `bucket_nullable` defaults to `false` +* current: optional. If true, the search includes the given, or current, event in the summary calculations. If false, the search uses the field value from the previous event. Syntax: current=<boolean>. **Default:** true. +* window: optional. Specifies the number of events to use when computing the statistics. Syntax: window=<int>. **Default:** 0, which means that all previous and current events are used. +* global: optional. Used only when the window argument is set. Defines whether to use a single window, global=true, or to use separate windows based on the by clause. If global=false and window is set to a non-zero value, a separate window is used for each group of values of the field specified in the by clause. Syntax: global=<boolean>. **Default:** true. +* reset_before: optional. Before streamstats calculates for an event, reset_before resets all accumulated statistics when the eval-expression evaluates to true. If used with window, the window is also reset. Syntax: reset_before="("<eval-expression>")". **Default:** false. +* reset_after: optional. After streamstats calculations for an event, reset_after resets all accumulated statistics when the eval-expression evaluates to true. This expression can reference fields returned by streamstats. If used with window, the window is also reset. Syntax: reset_after="("<eval-expression>")". **Default:** false. +* by-clause: optional.
The by clause can include fields and expressions, such as scalar functions and aggregation functions. In addition, the span clause can be used to split a specific field into buckets of equal intervals; the aggregation is then performed per span bucket. Syntax: by [span-expression,] [field,]... **Default:** If no by-clause is specified, all events are processed as a single group and running statistics are computed across the entire event stream. +* span-expression: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, `span(age, 10)` creates 10-year age buckets, `span(timestamp, 1h)` creates hourly buckets. + * Available time units + * millisecond (ms) + * second (s) + * minute (m, case sensitive) + * hour (h) + * day (d) + * week (w) + * month (M, case sensitive) + * quarter (q) + * year (y) + +## Aggregation Functions + +The streamstats command supports the following aggregation functions: +* COUNT: Count of values +* SUM: Sum of numeric values +* AVG: Average of numeric values +* MAX: Maximum value +* MIN: Minimum value +* VAR_SAMP: Sample variance +* VAR_POP: Population variance +* STDDEV_SAMP: Sample standard deviation +* STDDEV_POP: Population standard deviation +* DISTINCT_COUNT/DC: Distinct count of values +* EARLIEST: Earliest value by timestamp +* LATEST: Latest value by timestamp + +For detailed documentation of each function, see [Aggregation Functions](../functions/aggregations.md). +## Usage + +Example `streamstats` queries: + +``` +source = table | streamstats avg(a) +source = table | streamstats current = false avg(a) +source = table | streamstats window = 5 sum(b) +source = table | streamstats current = false window = 2 max(a) +source = table | where a < 50 | streamstats count(c) +source = table | streamstats min(c), max(c) by b +source = table | streamstats count(c) as count_by by b | where count_by > 1000 +source = table | streamstats dc(field) as distinct_count +source = table | streamstats distinct_count(category) by region +source = table | streamstats current=false window=2 global=false avg(a) by b +source = table | streamstats window=2 reset_before=a>31 avg(b) +source = table | streamstats current=false reset_after=a>31 avg(b) by c +``` + +## Example 1: Calculate the running average, sum, and count of a field by group + +This example calculates the running average age, running sum of age, and running count of events for all the accounts, grouped by gender.
+ +```ppl +source=accounts +| streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ +| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count | +|----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------| +| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 | +| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 | +| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 | +| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 | ++----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ +``` + +## Example 2: Running maximum age over a 2-row window + +This example calculates the running maximum age over a 2-row window, excluding the current event. + +```ppl +source=state_country +| streamstats current=false window=2 max(age) as prev_max_age +``` + +Expected output: + +```text +fetched rows / total rows = 8/8 ++-------+---------+------------+-------+------+-----+--------------+ +| name | country | state | month | year | age | prev_max_age | +|-------+---------+------------+-------+------+-----+--------------| +| Jake | USA | California | 4 | 2023 | 70 | null | +| Hello | USA | New York | 4 | 2023 | 30 | 70 | +| John | Canada | Ontario | 4 | 2023 | 25 | 70 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 30 | +| Jim | Canada | B.C | 4 | 2023 | 27 | 25 | +| Peter | Canada | B.C | 4 | 2023 | 57 | 27 | +| Rick | Canada | B.C | 4 | 2023 | 70 | 57 | +| David | USA | Washington | 4 | 2023 | 40 | 70 | ++-------+---------+------------+-------+------+-----+--------------+ +``` + +## Example 3: Use the global argument to calculate running statistics + +The global argument is only applicable when a window argument is set. It defines how the window is applied in relation to the grouping fields: +* global=true: a global window is applied across all rows, but the calculations inside the window still respect the by groups. +* global=false: the window itself is created per group, meaning each group gets its own independent window. + +This example shows how to calculate the running average of age across accounts by country, using the global argument.
+Original data: + +| name | country | state | month | year | age | +|-------|---------|------------|-------|------|-----| +| Jake | USA | California | 4 | 2023 | 70 | +| Hello | USA | New York | 4 | 2023 | 30 | +| John | Canada | Ontario | 4 | 2023 | 25 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | +| Jim | Canada | B.C | 4 | 2023 | 27 | +| Peter | Canada | B.C | 4 | 2023 | 57 | +| Rick | Canada | B.C | 4 | 2023 | 70 | +| David | USA | Washington | 4 | 2023 | 40 | +* global=true: The window slides across all rows globally (following their input order), but inside each window, aggregation is still computed by country. The data stream is processed row by row to build a sliding window of size 2, so David and Rick end up in the same window. +* global=false: Each by group (country) forms its own independent stream and window (size 2), so David and Hello share one window for USA. This time the running_avg for David is 35, rather than 40 as when global is set to true. + +```ppl +source=state_country +| streamstats window=2 global=true avg(age) as running_avg by country +``` + +Expected output: + +```text +fetched rows / total rows = 8/8 ++-------+---------+------------+-------+------+-----+-------------+ +| name | country | state | month | year | age | running_avg | +|-------+---------+------------+-------+------+-----+-------------| +| Jake | USA | California | 4 | 2023 | 70 | 70.0 | +| Hello | USA | New York | 4 | 2023 | 30 | 50.0 | +| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | +| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | +| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | +| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | +| David | USA | Washington | 4 | 2023 | 40 | 40.0 | ++-------+---------+------------+-------+------+-----+-------------+ +``` + +```ppl +source=state_country +| streamstats window=2 global=false avg(age) as running_avg by country +``` + +Expected output: + +```text +fetched rows / total rows = 8/8 ++-------+---------+------------+-------+------+-----+-------------+ +| name | country | state | month | year | age | running_avg | +|-------+---------+------------+-------+------+-----+-------------| +| Jake | USA | California | 4 | 2023 | 70 | 70.0 | +| Hello | USA | New York | 4 | 2023 | 30 | 50.0 | +| John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | +| Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | +| Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | +| Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | +| David | USA | Washington | 4 | 2023 | 40 | 35.0 | ++-------+---------+------------+-------+------+-----+-------------+ +``` + +## Example 4: Use the reset_before and reset_after arguments to reset statistics + +This example calculates the running average of age across accounts by country, with resets applied.
+ +```ppl +source=state_country +| streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country +``` + +Expected output: + +```text +fetched rows / total rows = 8/8 ++-------+---------+------------+-------+------+-----+---------+ +| name | country | state | month | year | age | avg_age | +|-------+---------+------------+-------+------+-----+---------| +| Jake | USA | California | 4 | 2023 | 70 | null | +| Hello | USA | New York | 4 | 2023 | 30 | 70.0 | +| John | Canada | Ontario | 4 | 2023 | 25 | null | +| Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 | +| Jim | Canada | B.C | 4 | 2023 | 27 | null | +| Peter | Canada | B.C | 4 | 2023 | 57 | null | +| Rick | Canada | B.C | 4 | 2023 | 70 | null | +| David | USA | Washington | 4 | 2023 | 40 | null | ++-------+---------+------------+-------+------+-----+---------+ +``` + +## Example 5: Null buckets handling + +```ppl +source=accounts +| streamstats bucket_nullable=false count() as cnt by employer +| fields account_number, firstname, employer, cnt +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------+----------+------+ +| account_number | firstname | employer | cnt | +|----------------+-----------+----------+------| +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | null | ++----------------+-----------+----------+------+ +``` + +```ppl +source=accounts +| streamstats bucket_nullable=true count() as cnt by employer +| fields account_number, firstname, employer, cnt +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-----------+----------+-----+ +| account_number | firstname | employer | cnt | +|----------------+-----------+----------+-----| +| 1 | Amber | Pyrami | 1 | +| 6 | Hattie | Netagy | 1 | +| 13 | Nanette | Quility | 1 | +| 18 | Dale | null | 1 | ++----------------+-----------+----------+-----+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/streamstats.rst b/docs/user/ppl/cmd/streamstats.rst deleted file mode 100644 index e38df779791..00000000000 --- a/docs/user/ppl/cmd/streamstats.rst +++ /dev/null @@ -1,273 +0,0 @@ -=========== -streamstats -=========== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -The ``streamstats`` command is used to calculate cumulative or rolling statistics as events are processed in order. Unlike ``stats`` or ``eventstats`` which operate on the entire dataset at once, it computes values incrementally on a per-event basis, often respecting the order of events in the search results. It allows you to generate running totals, moving averages, and other statistics that evolve with the stream of events. - -Key aspects of `streamstats`: - -1. It computes statistics incrementally as each event is processed, making it suitable for time-series and sequence-based analysis. -2. Supports arguments such as window (for sliding window calculations) and current (to control whether the current event included in calculation). -3. Retains all original events and appends new fields containing the calculated statistics. -4. Particularly useful for calculating running totals, identifying trends, or detecting changes over sequences of events. 
- -Difference between ``stats``, ``eventstats`` and ``streamstats`` - -All of these commands can be used to generate aggregations such as average, sum, and maximum, but they have some key differences in how they operate and what they produce: - -* Transformation Behavior - - * ``stats``: Transforms all events into an aggregated result table, losing original event structure. - * ``eventstats``: Adds aggregation results as new fields to the original events without removing the event structure. - * ``streamstats``: Adds cumulative (running) aggregation results to each event as they stream through the pipeline. - -* Output Format - - * ``stats``: Output contains only aggregated values. Original raw events are not preserved. - * ``eventstats``: Original events remain, with extra fields containing summary statistics. - * ``streamstats``: Original events remain, with extra fields containing running totals or cumulative statistics. - -* Aggregation Scope - - * ``stats``: Based on all events in the search (or groups defined by BY clause). - * ``eventstats``: Based on all relevant events, then the result is added back to each event in the group. - * ``streamstats``: Calculations occur progressively as each event is processed; can be scoped by window. - -* Use Cases - - * ``stats``: When only aggregated results are needed (e.g., counts, averages, sums). - * ``eventstats``: When aggregated statistics are needed alongside original event data. - * ``streamstats``: When a running total or cumulative statistic is needed across event streams. - -Syntax -====== -streamstats [bucket_nullable=bool] [current=] [window=] [global=] [reset_before="("")"] [reset_after="("")"] ... [by-clause] - -* function: mandatory. A aggregation function or window function. -* bucket_nullable: optional. Controls whether the streamstats command consider null buckets as a valid group in group-by aggregations. When set to ``false``, it will not treat null group-by values as a distinct group during aggregation. **Default:** Determined by ``plugins.ppl.syntax.legacy.preferred``. - - * When ``plugins.ppl.syntax.legacy.preferred=true``, ``bucket_nullable`` defaults to ``true`` - * When ``plugins.ppl.syntax.legacy.preferred=false``, ``bucket_nullable`` defaults to ``false`` - -* current: optional. If true, the search includes the given, or current, event in the summary calculations. If false, the search uses the field value from the previous event. Syntax: current=. **Default:** true. -* window: optional. Specifies the number of events to use when computing the statistics. Syntax: window=. **Default:** 0, which means that all previous and current events are used. -* global: optional. Used only when the window argument is set. Defines whether to use a single window, global=true, or to use separate windows based on the by clause. If global=false and window is set to a non-zero value, a separate window is used for each group of values of the field specified in the by clause. Syntax: global=. **Default:** true. -* reset_before: optional. Before streamstats calculates for an event, reset_before resets all accumulated statistics when the eval-expression evaluates to true. If used with window, the window is also reset. Syntax: reset_before="("")". **Default:** false. -* reset_after: optional. After streamstats calculations for an event, reset_after resets all accumulated statistics when the eval-expression evaluates to true. This expression can reference fields returned by streamstats. If used with window, the window is also reset. 
Syntax: reset_after="("")". **Default:** false. -* by-clause: optional. The by clause could be the fields and expressions like scalar functions and aggregation functions. Besides, the span clause can be used to split specific field into buckets in the same interval, the stats then does the aggregation by these span buckets. Syntax: by [span-expression,] [field,]... **Default:** If no is specified, all events are processed as a single group and running statistics are computed across the entire event stream. -* span-expression: optional, at most one. Splits field into buckets by intervals. Syntax: span(field_expr, interval_expr). For example, ``span(age, 10)`` creates 10-year age buckets, ``span(timestamp, 1h)`` creates hourly buckets. - - * Available time units - - * millisecond (ms) - * second (s) - * minute (m, case sensitive) - * hour (h) - * day (d) - * week (w) - * month (M, case sensitive) - * quarter (q) - * year (y) - -Aggregation Functions -===================== - -The streamstats command supports the following aggregation functions: - -* COUNT: Count of values -* SUM: Sum of numeric values -* AVG: Average of numeric values -* MAX: Maximum value -* MIN: Minimum value -* VAR_SAMP: Sample variance -* VAR_POP: Population variance -* STDDEV_SAMP: Sample standard deviation -* STDDEV_POP: Population standard deviation -* DISTINCT_COUNT/DC: Distinct count of values -* EARLIEST: Earliest value by timestamp -* LATEST: Latest value by timestamp - -For detailed documentation of each function, see `Aggregation Functions <../functions/aggregations.rst>`_. - -Usage -===== - -Streamstats:: - - source = table | streamstats avg(a) - source = table | streamstats current = false avg(a) - source = table | streamstats window = 5 sum(b) - source = table | streamstats current = false window = 2 max(a) - source = table | where a < 50 | streamstats count(c) - source = table | streamstats min(c), max(c) by b - source = table | streamstats count(c) as count_by by b | where count_by > 1000 - source = table | streamstats dc(field) as distinct_count - source = table | streamstats distinct_count(category) by region - source = table | streamstats current=false window=2 global=false avg(a) by b - source = table | streamstats window=2 reset_before=a>31 avg(b) - source = table | streamstats current=false reset_after=a>31 avg(b) by c - - -Example 1: Calculate the running average, sum, and count of a field by group -============================================================================ - -This example calculates the running average age, running sum of age, and running count of events for all the accounts, grouped by gender. 
- -PPL query:: - - os> source=accounts | streamstats avg(age) as running_avg, sum(age) as running_sum, count() as running_count by gender; - fetched rows / total rows = 4/4 - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ - | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | running_avg | running_sum | running_count | - |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------| - | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 32.0 | 32 | 1 | - | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 34.0 | 68 | 2 | - | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 28.0 | 28 | 1 | - | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 33.666666666666664 | 101 | 3 | - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+-------------+---------------+ - - -Example 2: Running maximum age over a 2-row window -================================================== - -This example calculates the running maximum age over a 2-row window, excluding the current event. - -PPL query:: - - os> source=state_country | streamstats current=false window=2 max(age) as prev_max_age - fetched rows / total rows = 8/8 - +-------+---------+------------+-------+------+-----+--------------+ - | name | country | state | month | year | age | prev_max_age | - |-------+---------+------------+-------+------+-----+--------------| - | Jake | USA | California | 4 | 2023 | 70 | null | - | Hello | USA | New York | 4 | 2023 | 30 | 70 | - | John | Canada | Ontario | 4 | 2023 | 25 | 70 | - | Jane | Canada | Quebec | 4 | 2023 | 20 | 30 | - | Jim | Canada | B.C | 4 | 2023 | 27 | 25 | - | Peter | Canada | B.C | 4 | 2023 | 57 | 27 | - | Rick | Canada | B.C | 4 | 2023 | 70 | 57 | - | David | USA | Washington | 4 | 2023 | 40 | 70 | - +-------+---------+------------+-------+------+-----+--------------+ - - -Example 3: Use the global argument to calculate running statistics -================================================================== - -The global argument is only applicable when a window argument is set. It defines how the window is applied in relation to the grouping fields: - -* global=true: a global window is applied across all rows, but the calculations inside the window still respect the by groups. -* global=false: the window itself is created per group, meaning each group gets its own independent window. - -This example shows how to calculate the running average of age across accounts by country, using global argument. 
- -original data:: - - +-------+---------+------------+-------+------+-----+ - | name | country | state | month | year | age | - |-------+---------+------------+-------+------+-----+ - | Jake | USA | California | 4 | 2023 | 70 | - | Hello | USA | New York | 4 | 2023 | 30 | - | John | Canada | Ontario | 4 | 2023 | 25 | - | Jane | Canada | Quebec | 4 | 2023 | 20 | - | Jim | Canada | B.C | 4 | 2023 | 27 | - | Peter | Canada | B.C | 4 | 2023 | 57 | - | Rick | Canada | B.C | 4 | 2023 | 70 | - | David | USA | Washington | 4 | 2023 | 40 | - +-------+---------+------------+-------+------+-----+ - -* global=true: The window slides across all rows globally (following their input order), but inside each window, aggregation is still computed by country. So we process the data stream row by row to build the sliding window with size 2. We can see that David and Rick are in a window. -* global=false: Each by group (country) forms its own independent stream and window (size 2). So David and Hello are in one window for USA. This time we get running_avg 35 for David, rather than 40 when global is set true. - -PPL query:: - - os> source=state_country | streamstats window=2 global=true avg(age) as running_avg by country ; - fetched rows / total rows = 8/8 - +-------+---------+------------+-------+------+-----+-------------+ - | name | country | state | month | year | age | running_avg | - |-------+---------+------------+-------+------+-----+-------------| - | Jake | USA | California | 4 | 2023 | 70 | 70.0 | - | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | - | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | - | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | - | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | - | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | - | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | - | David | USA | Washington | 4 | 2023 | 40 | 40.0 | - +-------+---------+------------+-------+------+-----+-------------+ - - os> source=state_country | streamstats window=2 global=false avg(age) as running_avg by country ; - fetched rows / total rows = 8/8 - +-------+---------+------------+-------+------+-----+-------------+ - | name | country | state | month | year | age | running_avg | - |-------+---------+------------+-------+------+-----+-------------| - | Jake | USA | California | 4 | 2023 | 70 | 70.0 | - | Hello | USA | New York | 4 | 2023 | 30 | 50.0 | - | John | Canada | Ontario | 4 | 2023 | 25 | 25.0 | - | Jane | Canada | Quebec | 4 | 2023 | 20 | 22.5 | - | Jim | Canada | B.C | 4 | 2023 | 27 | 23.5 | - | Peter | Canada | B.C | 4 | 2023 | 57 | 42.0 | - | Rick | Canada | B.C | 4 | 2023 | 70 | 63.5 | - | David | USA | Washington | 4 | 2023 | 40 | 35.0 | - +-------+---------+------------+-------+------+-----+-------------+ - - -Example 4: Use the reset_before and reset_after arguments to reset statistics -============================================================================= - -This example calculates the running average of age across accounts by country, with resets applied. 
- -PPL query:: - - os> source=state_country | streamstats current=false reset_before=age>34 reset_after=age<25 avg(age) as avg_age by country; - fetched rows / total rows = 8/8 - +-------+---------+------------+-------+------+-----+---------+ - | name | country | state | month | year | age | avg_age | - |-------+---------+------------+-------+------+-----+---------| - | Jake | USA | California | 4 | 2023 | 70 | null | - | Hello | USA | New York | 4 | 2023 | 30 | 70.0 | - | John | Canada | Ontario | 4 | 2023 | 25 | null | - | Jane | Canada | Quebec | 4 | 2023 | 20 | 25.0 | - | Jim | Canada | B.C | 4 | 2023 | 27 | null | - | Peter | Canada | B.C | 4 | 2023 | 57 | null | - | Rick | Canada | B.C | 4 | 2023 | 70 | null | - | David | USA | Washington | 4 | 2023 | 40 | null | - +-------+---------+------------+-------+------+-----+---------+ - - -Example 5: Null buckets handling -================================ - -PPL query:: - - os> source=accounts | streamstats bucket_nullable=false count() as cnt by employer | fields account_number, firstname, employer, cnt; - fetched rows / total rows = 4/4 - +----------------+-----------+----------+------+ - | account_number | firstname | employer | cnt | - |----------------+-----------+----------+------| - | 1 | Amber | Pyrami | 1 | - | 6 | Hattie | Netagy | 1 | - | 13 | Nanette | Quility | 1 | - | 18 | Dale | null | null | - +----------------+-----------+----------+------+ - -PPL query:: - - os> source=accounts | streamstats bucket_nullable=true count() as cnt by employer | fields account_number, firstname, employer, cnt; - fetched rows / total rows = 4/4 - +----------------+-----------+----------+-----+ - | account_number | firstname | employer | cnt | - |----------------+-----------+----------+-----| - | 1 | Amber | Pyrami | 1 | - | 6 | Hattie | Netagy | 1 | - | 13 | Nanette | Quility | 1 | - | 18 | Dale | null | 1 | - +----------------+-----------+----------+-----+ \ No newline at end of file diff --git a/docs/user/ppl/cmd/subquery.md b/docs/user/ppl/cmd/subquery.md new file mode 100644 index 00000000000..aa33fbbb119 --- /dev/null +++ b/docs/user/ppl/cmd/subquery.md @@ -0,0 +1,197 @@ +# subquery + +## Description + +The `subquery` command allows you to embed one PPL query inside another, enabling complex filtering and data retrieval operations. A subquery is a nested query that executes first and returns results that are used by the outer query for filtering, comparison, or joining operations. +Subqueries are useful for: +1. Filtering data based on results from another query +2. Checking for the existence of related data +3. Performing calculations that depend on aggregated values from other tables +4. Creating complex joins with dynamic conditions + +## Syntax + +subquery: [ source=... \| ... \| ... ] + +Subqueries use the same syntax as regular PPL queries but must be enclosed in square brackets. There are four main types of subqueries: + +**IN Subquery** +Tests whether a field value exists in the results of a subquery: + +```sql ignore +where [not] in [ source=... | ... | ... ] +``` + +**EXISTS Subquery** +Tests whether a subquery returns any results: + +```sql ignore +where [not] exists [ source=... | ... | ... ] +``` + +**Scalar Subquery** +Returns a single value that can be used in comparisons or calculations + +```sql ignore +where = [ source=... | ... | ... ] +``` + +**Relation Subquery** +Used in join operations to provide dynamic right-side data + +```sql ignore +| join ON condition [ source=... | ... | ... 
]
+```
+
+## Configuration
+
+### plugins.ppl.subsearch.maxout
+
+This setting configures the maximum number of rows a subsearch can return. The default value is `10000`. A value of `0` indicates that the restriction is unlimited.
+
+Change the subsearch.maxout to unlimited:
+
+```bash ignore
+sh$ curl -sS -H 'Content-Type: application/json' \
+... -X PUT localhost:9200/_plugins/_query/settings \
+... -d '{"persistent" : {"plugins.ppl.subsearch.maxout" : "0"}}'
+{
+  "acknowledged": true,
+  "persistent": {
+    "plugins": {
+      "ppl": {
+        "subsearch": {
+          "maxout": "-1"
+        }
+      }
+    }
+  },
+  "transient": {}
+}
+```
+
+## Usage
+
+InSubquery:
+
+```
+source = outer | where a in [ source = inner | fields b ]
+source = outer | where (a) in [ source = inner | fields b ]
+source = outer | where (a,b,c) in [ source = inner | fields d,e,f ]
+source = outer | where a not in [ source = inner | fields b ]
+source = outer | where (a) not in [ source = inner | fields b ]
+source = outer | where (a,b,c) not in [ source = inner | fields d,e,f ]
+source = outer a in [ source = inner | fields b ] // search filtering with subquery
+source = outer a not in [ source = inner | fields b ] // search filtering with subquery
+source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ] // nested
+source = table1 | inner join left = l right = r on l.a = r.a AND r.a in [ source = inner | fields d ] | fields l.a, r.a, b, c // as join filter
+```
+
+ExistsSubquery:
+
+```
+// Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, `f` are fields of table nested
+source = outer | where exists [ source = inner | where a = c ]
+source = outer | where not exists [ source = inner | where a = c ]
+source = outer | where exists [ source = inner | where a = c and b = d ]
+source = outer | where not exists [ source = inner | where a = c and b = d ]
+source = outer exists [ source = inner | where a = c ] // search filtering with subquery
+source = outer not exists [ source = inner | where a = c ] // search filtering with subquery
+source = table as t1 exists [ source = table as t2 | where t1.a = t2.a ] // table alias is useful in exists subquery
+source = outer | where exists [ source = inner1 | where a = c and exists [ source = nested | where c = e ] ] // nested
+source = outer | where exists [ source = inner1 | where a = c | where exists [ source = nested | where c = e ] ] // nested
+source = outer | where exists [ source = inner | where c > 10 ] // uncorrelated exists
+source = outer | where not exists [ source = inner | where c > 10 ] // uncorrelated exists
+source = outer | where exists [ source = inner ] | eval l = "nonEmpty" | fields l // special uncorrelated exists
+```
+
+ScalarSubquery:
+
+```
+// Uncorrelated scalar subquery in Select
+source = outer | eval m = [ source = inner | stats max(c) ] | fields m, a
+source = outer | eval m = [ source = inner | stats max(c) ] + b | fields m, a
+
+// Uncorrelated scalar subquery in Where
+source = outer | where a > [ source = inner | stats min(c) ] | fields a
+
+// Uncorrelated scalar subquery in Search filter
+source = outer a > [ source = inner | stats min(c) ] | fields a
+
+// Correlated scalar subquery in Select
+source = outer | eval m = [ source = inner | where outer.b = inner.d | stats max(c) ] | fields m, a
+source = outer | eval m = [ source = inner | where b = d | stats max(c) ] | fields m, a
+source = outer | eval m = [ source = inner | where outer.b > inner.d | stats max(c) ] | fields m, a
+
+// Correlated scalar subquery in Where
+source = outer | where a = [ source = inner | where outer.b = inner.d | stats max(c) ]
+source = outer | where a = [ source = inner | where b = d | stats max(c) ]
+source = outer | where [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a
+
+// Correlated scalar subquery in Search filter
+source = outer a = [ source = inner | where b = d | stats max(c) ]
+source = outer [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a
+
+// Nested scalar subquery
+source = outer | where a = [ source = inner | stats max(c) | sort c ] OR b = [ source = inner | where c = 1 | stats min(d) | sort d ]
+source = outer | where a = [ source = inner | where c = [ source = nested | stats max(e) by f | sort f ] | stats max(d) by c | sort c | head 1 ]
+```
+
+RelationSubquery:
+
+```
+source = table1 | join left = l right = r on condition [ source = table2 | where d > 10 | head 5 ] // subquery in join right side
+source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1
+```
+
+## Example 1: TPC-H q20
+
+This example shows a complex TPC-H query 20 implementation using nested subqueries.
+
+```bash ignore
+curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{
+  "query" : """
+  source = supplier
+  | join ON s_nationkey = n_nationkey nation
+  | where n_name = 'CANADA'
+    and s_suppkey in [
+      source = partsupp
+      | where ps_partkey in [
+          source = part
+          | where like(p_name, 'forest%')
+          | fields p_partkey
+        ]
+        and ps_availqty > [
+          source = lineitem
+          | where l_partkey = ps_partkey
+            and l_suppkey = ps_suppkey
+            and l_shipdate >= date('1994-01-01')
+            and l_shipdate < date_add(date('1994-01-01'), interval 1 year)
+          | stats sum(l_quantity) as sum_l_quantity
+          | eval half_sum_l_quantity = 0.5 * sum_l_quantity // Stats and Eval commands can combine when issues/819 resolved
+          | fields half_sum_l_quantity
+        ]
+      | fields ps_suppkey
+    ]
+  """
+}'
+```
+
+## Example 2: TPC-H q22
+
+This example shows a TPC-H query 22 implementation using EXISTS and scalar subqueries.
+
+```bash ignore
+curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{
+  "query" : """
+  source = [
+      source = customer
+      | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17')
+        and c_acctbal > [
+          source = customer
+          | where c_acctbal > 0.00
+            and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17')
+          | stats avg(c_acctbal)
+        ]
+        and not exists [
+          source = orders
+          | where o_custkey = c_custkey
+        ]
+      | eval cntrycode = substring(c_phone, 1, 2)
+      | fields cntrycode, c_acctbal
+    ] as custsale
+  | stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode
+  | sort cntrycode
+  """
+}'
+```
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/subquery.rst b/docs/user/ppl/cmd/subquery.rst
deleted file mode 100644
index 48491db22e2..00000000000
--- a/docs/user/ppl/cmd/subquery.rst
+++ /dev/null
@@ -1,206 +0,0 @@
-========
-subquery
-========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``subquery`` command allows you to embed one PPL query inside another, enabling complex filtering and data retrieval operations.
-
-| Subqueries are useful for:
-
-1.
Filtering data based on results from another query -2. Checking for the existence of related data -3. Performing calculations that depend on aggregated values from other tables -4. Creating complex joins with dynamic conditions - -Syntax -====== -subquery: [ source=... | ... | ... ] - -Subqueries use the same syntax as regular PPL queries but must be enclosed in square brackets. There are four main types of subqueries: - -**IN Subquery** -Tests whether a field value exists in the results of a subquery:: - - where [not] in [ source=... | ... | ... ] - -**EXISTS Subquery** -Tests whether a subquery returns any results:: - - where [not] exists [ source=... | ... | ... ] - -**Scalar Subquery** -Returns a single value that can be used in comparisons or calculations:: - - where = [ source=... | ... | ... ] - -**Relation Subquery** -Used in join operations to provide dynamic right-side data:: - - | join ON condition [ source=... | ... | ... ] - -Configuration -============= - -plugins.ppl.subsearch.maxout ----------------------------- - -The size configures the maximum of rows to return from subsearch. The default value is: ``10000``. A value of ``0`` indicates that the restriction is unlimited. - -Change the subsearch.maxout to unlimited:: - - sh$ curl -sS -H 'Content-Type: application/json' \ - ... -X PUT localhost:9200/_plugins/_query/settings \ - ... -d '{"persistent" : {"plugins.ppl.subsearch.maxout" : "0"}}' - { - "acknowledged": true, - "persistent": { - "plugins": { - "ppl": { - "subsearch": { - "maxout": "-1" - } - } - } - }, - "transient": {} - } - -Usage -===== - -InSubquery:: - - source = outer | where a in [ source = inner | fields b ] - source = outer | where (a) in [ source = inner | fields b ] - source = outer | where (a,b,c) in [ source = inner | fields d,e,f ] - source = outer | where a not in [ source = inner | fields b ] - source = outer | where (a) not in [ source = inner | fields b ] - source = outer | where (a,b,c) not in [ source = inner | fields d,e,f ] - source = outer a in [ source = inner | fields b ] // search filtering with subquery - source = outer a not in [ source = inner | fields b ] // search filtering with subquery) - source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ] // nested - source = table1 | inner join left = l right = r on l.a = r.a AND r.a in [ source = inner | fields d ] | fields l.a, r.a, b, c //as join filter - -ExistsSubquery:: - - // Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, `f` are fields of table nested - source = outer | where exists [ source = inner | where a = c ] - source = outer | where not exists [ source = inner | where a = c ] - source = outer | where exists [ source = inner | where a = c and b = d ] - source = outer | where not exists [ source = inner | where a = c and b = d ] - source = outer exists [ source = inner | where a = c ] // search filtering with subquery - source = outer not exists [ source = inner | where a = c ] //search filtering with subquery - source = table as t1 exists [ source = table as t2 | where t1.a = t2.a ] //table alias is useful in exists subquery - source = outer | where exists [ source = inner1 | where a = c and exists [ source = nested | where c = e ] ] //nested - source = outer | where exists [ source = inner1 | where a = c | where exists [ source = nested | where c = e ] ] //nested - source = outer | where exists [ source = inner | where c > 10 ] //uncorrelated exists - source = outer | where not exists 
[ source = inner | where c > 10 ] //uncorrelated exists - source = outer | where exists [ source = inner ] | eval l = "nonEmpty" | fields l //special uncorrelated exists - -ScalarSubquery:: - - //Uncorrelated scalar subquery in Select - source = outer | eval m = [ source = inner | stats max(c) ] | fields m, a - source = outer | eval m = [ source = inner | stats max(c) ] + b | fields m, a - - //Uncorrelated scalar subquery in Where** - source = outer | where a > [ source = inner | stats min(c) ] | fields a - - //Uncorrelated scalar subquery in Search filter - source = outer a > [ source = inner | stats min(c) ] | fields a - - //Correlated scalar subquery in Select - source = outer | eval m = [ source = inner | where outer.b = inner.d | stats max(c) ] | fields m, a - source = outer | eval m = [ source = inner | where b = d | stats max(c) ] | fields m, a - source = outer | eval m = [ source = inner | where outer.b > inner.d | stats max(c) ] | fields m, a - - //Correlated scalar subquery in Where - source = outer | where a = [ source = inner | where outer.b = inner.d | stats max(c) ] - source = outer | where a = [ source = inner | where b = d | stats max(c) ] - source = outer | where [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a - - //Correlated scalar subquery in Search filter - source = outer a = [ source = inner | where b = d | stats max(c) ] - source = outer [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a - - //Nested scalar subquery - source = outer | where a = [ source = inner | stats max(c) | sort c ] OR b = [ source = inner | where c = 1 | stats min(d) | sort d ] - source = outer | where a = [ source = inner | where c = [ source = nested | stats max(e) by f | sort f ] | stats max(d) by c | sort c | head 1 ] - -RelationSubquery:: - - source = table1 | join left = l right = r on condition [ source = table2 | where d > 10 | head 5 ] //subquery in join right side - source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1 - -Example 1: TPC-H q20 -==================== - -This example shows a complex TPC-H query 20 implementation using nested subqueries. - -PPL query:: - - >> curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{ - "query" : """ - source = supplier - | join ON s_nationkey = n_nationkey nation - | where n_name = 'CANADA' - and s_suppkey in [ - source = partsupp - | where ps_partkey in [ - source = part - | where like(p_name, 'forest%') - | fields p_partkey - ] - and ps_availqty > [ - source = lineitem - | where l_partkey = ps_partkey - and l_suppkey = ps_suppkey - and l_shipdate >= date('1994-01-01') - and l_shipdate < date_add(date('1994-01-01'), interval 1 year) - | stats sum(l_quantity) as sum_l_quantity - | eval half_sum_l_quantity = 0.5 * sum_l_quantity // Stats and Eval commands can combine when issues/819 resolved - | fields half_sum_l_quantity - ] - | fields ps_suppkey - ] - """ - }' - -Example 2: TPC-H q22 -==================== - -This example shows a TPC-H query 22 implementation using EXISTS and scalar subqueries. 
-
-PPL query::
-
-    >> curl -H 'Content-Type: application/json' -X POST localhost:9200/_plugins/_ppl -d '{
-    "query" : """
-    source = [
-        source = customer
-        | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17')
-          and c_acctbal > [
-            source = customer
-            | where c_acctbal > 0.00
-              and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17')
-            | stats avg(c_acctbal)
-          ]
-          and not exists [
-            source = orders
-            | where o_custkey = c_custkey
-          ]
-        | eval cntrycode = substring(c_phone, 1, 2)
-        | fields cntrycode, c_acctbal
-    ] as custsale
-    | stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode
-    | sort cntrycode
-    """
-    }'
-
diff --git a/docs/user/ppl/cmd/syntax.md b/docs/user/ppl/cmd/syntax.md
new file mode 100644
index 00000000000..32c5ebe89d9
--- /dev/null
+++ b/docs/user/ppl/cmd/syntax.md
@@ -0,0 +1,18 @@
+# Syntax
+
+## Command Order
+
+The PPL query starts with either the `search` command to reference a table to search from, or the `describe` command to reference a table to get its metadata. All subsequent commands can appear in any order. In the following example, the `search` command references the accounts index as the source, then the `fields` and `where` commands perform further processing.
+
+```text
+search source=accounts
+| where age > 18
+| fields firstname, lastname
+```
+
+## Required arguments
+
+Required arguments are shown in angle brackets < >.
+
+## Optional arguments
+
+Optional arguments are enclosed in square brackets [ ].
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/syntax.rst b/docs/user/ppl/cmd/syntax.rst
deleted file mode 100644
index c15aad68e15..00000000000
--- a/docs/user/ppl/cmd/syntax.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-======
-Syntax
-======
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-Command Order
-=============
-The PPL query starts with either the ``search`` command to reference a table to search from, or the ``describe`` command to reference a table to get its metadata. All the following command could be in any order. In the following example, ``search`` command refer the accounts index as the source, then using fields and where command to do the further processing.
-
-.. code-block::
-
-   search source=accounts
-   | where age > 18
-   | fields firstname, lastname
-
-
-Required arguments
-==================
-Required arguments are shown in angle brackets < >.
-
-
-Optional arguments
-==================
-Optional arguments are enclosed in square brackets [ ].
-
diff --git a/docs/user/ppl/cmd/table.md b/docs/user/ppl/cmd/table.md
new file mode 100644
index 00000000000..176752ebfba
--- /dev/null
+++ b/docs/user/ppl/cmd/table.md
@@ -0,0 +1,37 @@
+# table
+
+## Description
+
+The `table` command is an alias for the [`fields`](fields.md) command and provides the same field selection capabilities. It allows you to keep or remove fields from the search result using enhanced syntax options.
+
+## Syntax
+
+table [+\|-] \<field-list\>
+
+* [+\|-]: optional. If the plus (+) is used, only the fields specified in the field list will be kept. If the minus (-) is used, all the fields specified in the field list will be removed (see the sketch below). **Default:** +.
+* field-list: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns.
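+
+As a quick sketch of the minus form (illustrative only, not doctested), the following would drop the `email` and `address` fields while keeping all other fields:
+
+```sql ignore
+source=accounts | table - email address
+```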
+
+## Example 1: Basic table command usage
+
+This example shows basic field selection using the table command.
+
+```ppl
+source=accounts
+| table firstname lastname age
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------+----------+-----+
+| firstname | lastname | age |
+|-----------+----------+-----|
+| Amber     | Duke     | 32  |
+| Hattie    | Bond     | 36  |
+| Nanette   | Bates    | 28  |
+| Dale      | Adams    | 33  |
++-----------+----------+-----+
+```
+
+## See Also
+
+- [fields](fields.md) - Alias command with identical functionality
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/table.rst b/docs/user/ppl/cmd/table.rst
deleted file mode 100644
index 3512a648a1c..00000000000
--- a/docs/user/ppl/cmd/table.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-=====
-table
-=====
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-The ``table`` command is an alias for the `fields `_ command and provides the same field selection capabilities. It allows you to keep or remove fields from the search result using enhanced syntax options.
-
-Syntax
-======
-table [+|-] 
-
-* [+|-]: optional. If the plus (+) is used, only the fields specified in the field list will be kept. If the minus (-) is used, all the fields specified in the field list will be removed. **Default:** +.
-* field-list: mandatory. Comma-delimited or space-delimited list of fields to keep or remove. Supports wildcard patterns.
-
-Example 1: Basic table command usage
-====================================
-
-This example shows basic field selection using the table command.
-
-PPL query::
-
-    os> source=accounts | table firstname lastname age;
-    fetched rows / total rows = 4/4
-    +-----------+----------+-----+
-    | firstname | lastname | age |
-    |-----------+----------+-----|
-    | Amber     | Duke     | 32  |
-    | Hattie    | Bond     | 36  |
-    | Nanette   | Bates    | 28  |
-    | Dale      | Adams    | 33  |
-    +-----------+----------+-----+
-
-
-See Also
-========
-- `fields `_ - Alias command with identical functionality
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/timechart.md b/docs/user/ppl/cmd/timechart.md
new file mode 100644
index 00000000000..da3831c7aea
--- /dev/null
+++ b/docs/user/ppl/cmd/timechart.md
@@ -0,0 +1,375 @@
+# timechart
+
+## Description
+
+The `timechart` command creates a time-based aggregation of data. It groups data by time intervals and optionally by a field, then applies an aggregation function to each group. The results are returned in an unpivoted format with separate rows for each time-field combination.
+
+## Syntax
+
+timechart [timefield=\<field\>] [span=\<interval\>] [limit=\<int\>] [useother=\<bool\>] [usenull=\<bool\>] [nullstr=\<string\>] \<aggregation_function\> [by \<field\>]
+
+* timefield: optional. Specifies the timestamp field to use for time interval grouping. **Default:** `@timestamp`.
+* span: optional. Specifies the time interval for grouping data. **Default:** 1m (1 minute).
+  * Available time units:
+    * millisecond (ms)
+    * second (s)
+    * minute (m, case sensitive)
+    * hour (h)
+    * day (d)
+    * week (w)
+    * month (M, case sensitive)
+    * quarter (q)
+    * year (y)
+* limit: optional. Specifies the maximum number of distinct values to display when using the "by" clause. **Default:** 10.
+  * When there are more distinct values than the limit, the additional values are grouped into an "OTHER" category if useother is not set to false.
+  * The "most distinct" values are determined by calculating the sum of the aggregation values across all time intervals for each distinct field value. The top N values with the highest sums are displayed individually, while the rest are grouped into the "OTHER" category.
+  * Set to 0 to show all distinct values without any limit (when limit=0, useother is automatically set to false).
+  * The parameters can be specified in any order before the aggregation function.
+  * Only applies when using the "by" clause to group results.
+* useother: optional. Controls whether to create an "OTHER" category for values beyond the limit. **Default:** true.
+  * When set to false, only the top N values (based on limit) are shown without an "OTHER" column.
+  * When set to true, values beyond the limit are grouped into an "OTHER" category.
+  * Only applies when using the "by" clause and when there are more distinct values than the limit.
+* usenull: optional. Controls whether NULL values are placed into a separate category in the chart. **Default:** true.
+  * When set to true, NULL values are grouped into a separate category with the label specified by nullstr.
+  * When set to false, NULL values are excluded from the results.
+* nullstr: optional. The display label used for NULL values when usenull is true. **Default:** "NULL".
+  * Specifies the string representation for the NULL category in the chart output.
+* aggregation_function: mandatory. The aggregation function to apply to each time bucket.
+  * Currently, only a single aggregation function is supported.
+  * Available functions: All aggregation functions supported by the [stats](stats.md) command, as well as the timechart-specific aggregations listed below.
+* by: optional. Groups the results by the specified field in addition to time intervals. If not specified, the aggregation is performed across all documents in each time interval.
+
+## PER_SECOND
+
+Usage: per_second(field) calculates the per-second rate for a numeric field within each time bucket.
+
+The calculation formula is: `per_second(field) = sum(field) / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+
+Return type: DOUBLE
+
+## PER_MINUTE
+
+Usage: per_minute(field) calculates the per-minute rate for a numeric field within each time bucket.
+
+The calculation formula is: `per_minute(field) = sum(field) * 60 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+
+Return type: DOUBLE
+
+## PER_HOUR
+
+Usage: per_hour(field) calculates the per-hour rate for a numeric field within each time bucket.
+
+The calculation formula is: `per_hour(field) = sum(field) * 3600 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+
+Return type: DOUBLE
+
+## PER_DAY
+
+Usage: per_day(field) calculates the per-day rate for a numeric field within each time bucket.
+
+The calculation formula is: `per_day(field) = sum(field) * 86400 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds.
+
+Return type: DOUBLE
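+
+As a quick sanity check of these formulas, here is a worked example using the numbers from Example 11 below, where `span=30m` (1800 seconds) and server1's packets sum to 180 in the 10:00-10:30 bucket:
+
+```text
+per_second(packets) = 180 / 1800      = 0.1
+per_minute(packets) = 180 * 60 / 1800 = 6.0
+```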
+
+## Notes
+
+* The `timechart` command requires a timestamp field in the data. By default, it uses the `@timestamp` field, but you can specify a different field using the `timefield` parameter.
+* Results are returned in an unpivoted format with separate rows for each time-field combination that has data.
+* Only combinations with actual data are included in the results - empty combinations are omitted rather than showing null or zero values.
+* The "top N" values for the `limit` parameter are selected based on the sum of values across all time intervals for each distinct field value.
+* When using the `limit` parameter, values beyond the limit are grouped into an "OTHER" category (unless `useother=false`).
+* Examples 6 and 7 use different datasets: Example 6 uses the `events` dataset with fewer hosts for simplicity, while Example 7 uses the `events_many_hosts` dataset with 11 distinct hosts.
+* **Null values**: Documents with null values in the "by" field are treated as a separate category; with the default `usenull=true`, they appear under the `nullstr` label ("NULL" by default), as shown in Example 10.
+
+## Example 1: Count events by hour
+
+This example counts events for each hour and groups them by host.
+
+```ppl
+source=events
+| timechart span=1h count() by host
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++---------------------+---------+---------+
+| @timestamp          | host    | count() |
+|---------------------+---------+---------|
+| 2023-01-01 10:00:00 | server1 | 4       |
+| 2023-01-01 10:00:00 | server2 | 4       |
++---------------------+---------+---------+
+```
+
+## Example 2: Count events by minute
+
+This example counts events for each minute and groups them by host.
+
+```ppl
+source=events
+| timechart span=1m count() by host
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 8/8
++---------------------+---------+---------+
+| @timestamp          | host    | count() |
+|---------------------+---------+---------|
+| 2023-01-01 10:00:00 | server1 | 1       |
+| 2023-01-01 10:05:00 | server2 | 1       |
+| 2023-01-01 10:10:00 | server1 | 1       |
+| 2023-01-01 10:15:00 | server2 | 1       |
+| 2023-01-01 10:20:00 | server1 | 1       |
+| 2023-01-01 10:25:00 | server2 | 1       |
+| 2023-01-01 10:30:00 | server1 | 1       |
+| 2023-01-01 10:35:00 | server2 | 1       |
++---------------------+---------+---------+
+```
+
+## Example 3: Calculate average number of packets by minute
+
+This example calculates the average number of packets for each minute without grouping by any field.
+
+```ppl
+source=events
+| timechart span=1m avg(packets)
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 8/8
++---------------------+--------------+
+| @timestamp          | avg(packets) |
+|---------------------+--------------|
+| 2023-01-01 10:00:00 | 60.0         |
+| 2023-01-01 10:05:00 | 30.0         |
+| 2023-01-01 10:10:00 | 60.0         |
+| 2023-01-01 10:15:00 | 30.0         |
+| 2023-01-01 10:20:00 | 60.0         |
+| 2023-01-01 10:25:00 | 30.0         |
+| 2023-01-01 10:30:00 | 180.0        |
+| 2023-01-01 10:35:00 | 90.0         |
++---------------------+--------------+
+```
+
+## Example 4: Calculate average number of packets by every 20 minutes and status
+
+This example calculates the average number of packets for every 20 minutes and groups them by status.
+
+```ppl
+source=events
+| timechart span=20m avg(packets) by status
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 8/8
++---------------------+------------+--------------+
+| @timestamp          | status     | avg(packets) |
+|---------------------+------------+--------------|
+| 2023-01-01 10:00:00 | active     | 30.0         |
+| 2023-01-01 10:00:00 | inactive   | 30.0         |
+| 2023-01-01 10:00:00 | pending    | 60.0         |
+| 2023-01-01 10:00:00 | processing | 60.0         |
+| 2023-01-01 10:20:00 | cancelled  | 180.0        |
+| 2023-01-01 10:20:00 | completed  | 60.0         |
+| 2023-01-01 10:20:00 | inactive   | 90.0         |
+| 2023-01-01 10:20:00 | pending    | 30.0         |
++---------------------+------------+--------------+
+```
+
+## Example 5: Count events by hour and category
+
+This example counts events for each hour and groups them by category.
+
+```ppl
+source=events
+| timechart span=1h count() by category
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++---------------------+----------+---------+
+| @timestamp          | category | count() |
+|---------------------+----------+---------|
+| 2023-01-01 10:00:00 | orders   | 4       |
+| 2023-01-01 10:00:00 | users    | 4       |
++---------------------+----------+---------+
+```
+
+## Example 6: Using the limit parameter with count() function
+
+When there are many distinct values in the "by" field, the timechart command will display the top values based on the limit parameter and group the rest into an "OTHER" category.
+This query will display the top 2 hosts with the highest count values, and group the remaining hosts into an "OTHER" category.
+
+```ppl
+source=events
+| timechart span=1m limit=2 count() by host
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 8/8
++---------------------+---------+---------+
+| @timestamp          | host    | count() |
+|---------------------+---------+---------|
+| 2023-01-01 10:00:00 | server1 | 1       |
+| 2023-01-01 10:05:00 | server2 | 1       |
+| 2023-01-01 10:10:00 | server1 | 1       |
+| 2023-01-01 10:15:00 | server2 | 1       |
+| 2023-01-01 10:20:00 | server1 | 1       |
+| 2023-01-01 10:25:00 | server2 | 1       |
+| 2023-01-01 10:30:00 | server1 | 1       |
+| 2023-01-01 10:35:00 | server2 | 1       |
++---------------------+---------+---------+
+```
+
+## Example 7: Using limit=0 with count() to show all values
+
+To display all distinct values without any limit, set limit=0:
+
+```ppl
+source=events_many_hosts
+| timechart span=1h limit=0 count() by host
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 11/11
++---------------------+--------+---------+
+| @timestamp          | host   | count() |
+|---------------------+--------+---------|
+| 2024-07-01 00:00:00 | web-01 | 1       |
+| 2024-07-01 00:00:00 | web-02 | 1       |
+| 2024-07-01 00:00:00 | web-03 | 1       |
+| 2024-07-01 00:00:00 | web-04 | 1       |
+| 2024-07-01 00:00:00 | web-05 | 1       |
+| 2024-07-01 00:00:00 | web-06 | 1       |
+| 2024-07-01 00:00:00 | web-07 | 1       |
+| 2024-07-01 00:00:00 | web-08 | 1       |
+| 2024-07-01 00:00:00 | web-09 | 1       |
+| 2024-07-01 00:00:00 | web-10 | 1       |
+| 2024-07-01 00:00:00 | web-11 | 1       |
++---------------------+--------+---------+
+```
+
+This shows all 11 hosts as separate rows without an "OTHER" category.
+
+## Example 8: Using useother=false with count() function + +Limit to top 10 hosts without OTHER category (useother=false): + +```ppl +source=events_many_hosts +| timechart span=1h useother=false count() by host +``` + +Expected output: + +```text +fetched rows / total rows = 10/10 ++---------------------+--------+---------+ +| @timestamp | host | count() | +|---------------------+--------+---------| +| 2024-07-01 00:00:00 | web-01 | 1 | +| 2024-07-01 00:00:00 | web-02 | 1 | +| 2024-07-01 00:00:00 | web-03 | 1 | +| 2024-07-01 00:00:00 | web-04 | 1 | +| 2024-07-01 00:00:00 | web-05 | 1 | +| 2024-07-01 00:00:00 | web-06 | 1 | +| 2024-07-01 00:00:00 | web-07 | 1 | +| 2024-07-01 00:00:00 | web-08 | 1 | +| 2024-07-01 00:00:00 | web-09 | 1 | +| 2024-07-01 00:00:00 | web-10 | 1 | ++---------------------+--------+---------+ +``` + +## Example 9: Using limit with useother parameter and avg() function + +Limit to top 3 hosts with OTHER category (default useother=true): + +```ppl +source=events_many_hosts +| timechart span=1h limit=3 avg(cpu_usage) by host +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------------------+--------+----------------+ +| @timestamp | host | avg(cpu_usage) | +|---------------------+--------+----------------| +| 2024-07-01 00:00:00 | OTHER | 41.3 | +| 2024-07-01 00:00:00 | web-03 | 55.3 | +| 2024-07-01 00:00:00 | web-07 | 48.6 | +| 2024-07-01 00:00:00 | web-09 | 67.8 | ++---------------------+--------+----------------+ +``` + +Limit to top 3 hosts without OTHER category (useother=false): + +```ppl +source=events_many_hosts +| timechart span=1h limit=3 useother=false avg(cpu_usage) by host +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++---------------------+--------+----------------+ +| @timestamp | host | avg(cpu_usage) | +|---------------------+--------+----------------| +| 2024-07-01 00:00:00 | web-03 | 55.3 | +| 2024-07-01 00:00:00 | web-07 | 48.6 | +| 2024-07-01 00:00:00 | web-09 | 67.8 | ++---------------------+--------+----------------+ +``` + +## Example 10: Handling null values in the "by" field + +This example shows how null values in the "by" field are treated as a separate category. The dataset events_null has 1 entry that does not have a host field. +It is put into a separate "NULL" category because the defaults for `usenull` and `nullstr` are `true` and `"NULL"` respectively. + +```ppl +source=events_null +| timechart span=1h count() by host +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------------------+--------+---------+ +| @timestamp | host | count() | +|---------------------+--------+---------| +| 2024-07-01 00:00:00 | NULL | 1 | +| 2024-07-01 00:00:00 | db-01 | 1 | +| 2024-07-01 00:00:00 | web-01 | 2 | +| 2024-07-01 00:00:00 | web-02 | 2 | ++---------------------+--------+---------+ +``` + +## Example 11: Calculate packets per second rate + +This example calculates the per-second packet rate for network traffic data using the per_second() function. 
+ +```ppl +source=events +| timechart span=30m per_second(packets) by host +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------------------+---------+---------------------+ +| @timestamp | host | per_second(packets) | +|---------------------+---------+---------------------| +| 2023-01-01 10:00:00 | server1 | 0.1 | +| 2023-01-01 10:00:00 | server2 | 0.05 | +| 2023-01-01 10:30:00 | server1 | 0.1 | +| 2023-01-01 10:30:00 | server2 | 0.05 | ++---------------------+---------+---------------------+ +``` + +## Limitations + +* Only a single aggregation function is supported per timechart command. +* The `bins` parameter and other bin options are not supported since the `bin` command is not implemented yet. Use the `span` parameter to control time intervals. \ No newline at end of file diff --git a/docs/user/ppl/cmd/timechart.rst b/docs/user/ppl/cmd/timechart.rst deleted file mode 100644 index 21ac980d46a..00000000000 --- a/docs/user/ppl/cmd/timechart.rst +++ /dev/null @@ -1,351 +0,0 @@ -========= -timechart -========= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``timechart`` command creates a time-based aggregation of data. It groups data by time intervals and optionally by a field, then applies an aggregation function to each group. The results are returned in an unpivoted format with separate rows for each time-field combination. - -Syntax -====== - -timechart [timefield=] [span=] [limit=] [useother=] [by ] - -* timefield: optional. Specifies the timestamp field to use for time interval grouping. **Default**: ``@timestamp``. - -* span: optional. Specifies the time interval for grouping data. **Default:** 1m (1 minute). - - * Available time units: - - * millisecond (ms) - * second (s) - * minute (m, case sensitive) - * hour (h) - * day (d) - * week (w) - * month (M, case sensitive) - * quarter (q) - * year (y) - -* limit: optional. Specifies the maximum number of distinct values to display when using the "by" clause. **Default:** 10. - - * When there are more distinct values than the limit, the additional values are grouped into an "OTHER" category if useother is not set to false. - * The "most distinct" values are determined by calculating the sum of the aggregation values across all time intervals for each distinct field value. The top N values with the highest sums are displayed individually, while the rest are grouped into the "OTHER" category. - * Set to 0 to show all distinct values without any limit (when limit=0, useother is automatically set to false). - * The parameters can be specified in any order before the aggregation function. - * Only applies when using the "by" clause to group results. - -* useother: optional. Controls whether to create an "OTHER" category for values beyond the limit. **Default:** true. - - * When set to false, only the top N values (based on limit) are shown without an "OTHER" column. - * When set to true, values beyond the limit are grouped into an "OTHER" category. - * Only applies when using the "by" clause and when there are more distinct values than the limit. - -* aggregation_function: mandatory. The aggregation function to apply to each time bucket. - - * Currently, only a single aggregation function is supported. - * Available functions: All aggregation functions supported by the :doc:`stats ` command, as well as the timechart-specific aggregations listed below. - -* by: optional. Groups the results by the specified field in addition to time intervals. 
If not specified, the aggregation is performed across all documents in each time interval. - -PER_SECOND ----------- - -Usage: per_second(field) calculates the per-second rate for a numeric field within each time bucket. - -The calculation formula is: `per_second(field) = sum(field) / span_in_seconds`, where `span_in_seconds` is the span interval in seconds. - -Return type: DOUBLE - -PER_MINUTE ----------- - -Usage: per_minute(field) calculates the per-minute rate for a numeric field within each time bucket. - -The calculation formula is: `per_minute(field) = sum(field) * 60 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds. - -Return type: DOUBLE - -PER_HOUR --------- - -Usage: per_hour(field) calculates the per-hour rate for a numeric field within each time bucket. - -The calculation formula is: `per_hour(field) = sum(field) * 3600 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds. - -Return type: DOUBLE - -PER_DAY -------- - -Usage: per_day(field) calculates the per-day rate for a numeric field within each time bucket. - -The calculation formula is: `per_day(field) = sum(field) * 86400 / span_in_seconds`, where `span_in_seconds` is the span interval in seconds. - -Return type: DOUBLE - -Notes -===== - -* The ``timechart`` command requires a timestamp field in the data. By default, it uses the ``@timestamp`` field, but you can specify a different field using the ``timefield`` parameter. -* Results are returned in an unpivoted format with separate rows for each time-field combination that has data. -* Only combinations with actual data are included in the results - empty combinations are omitted rather than showing null or zero values. -* The "top N" values for the ``limit`` parameter are selected based on the sum of values across all time intervals for each distinct field value. -* When using the ``limit`` parameter, values beyond the limit are grouped into an "OTHER" category (unless ``useother=false``). -* Examples 6 and 7 use different datasets: Example 6 uses the ``events`` dataset with fewer hosts for simplicity, while Example 7 uses the ``events_many_hosts`` dataset with 11 distinct hosts. - -* **Null values**: Documents with null values in the "by" field are treated as a separate category and appear as null in the results. - -Example 1: Count events by hour -=============================== - -This example counts events for each hour and groups them by host. - -PPL query:: - - os> source=events | timechart span=1h count() by host - fetched rows / total rows = 2/2 - +---------------------+---------+---------+ - | @timestamp | host | count() | - |---------------------+---------+---------| - | 2023-01-01 10:00:00 | server1 | 4 | - | 2023-01-01 10:00:00 | server2 | 4 | - +---------------------+---------+---------+ - -Example 2: Count events by minute -========================================================== - -This example counts events for each minute and groups them by host. 
- -PPL query:: - - os> source=events | timechart span=1m count() by host - fetched rows / total rows = 8/8 - +---------------------+---------+---------+ - | @timestamp | host | count() | - |---------------------+---------+---------| - | 2023-01-01 10:00:00 | server1 | 1 | - | 2023-01-01 10:05:00 | server2 | 1 | - | 2023-01-01 10:10:00 | server1 | 1 | - | 2023-01-01 10:15:00 | server2 | 1 | - | 2023-01-01 10:20:00 | server1 | 1 | - | 2023-01-01 10:25:00 | server2 | 1 | - | 2023-01-01 10:30:00 | server1 | 1 | - | 2023-01-01 10:35:00 | server2 | 1 | - +---------------------+---------+---------+ - -Example 3: Calculate average number of packets by minute -================================================ - -This example calculates the average packets for each minute without grouping by any field. - -PPL query:: - - os> source=events | timechart span=1m avg(packets) - fetched rows / total rows = 8/8 - +---------------------+--------------+ - | @timestamp | avg(packets) | - |---------------------+--------------| - | 2023-01-01 10:00:00 | 60.0 | - | 2023-01-01 10:05:00 | 30.0 | - | 2023-01-01 10:10:00 | 60.0 | - | 2023-01-01 10:15:00 | 30.0 | - | 2023-01-01 10:20:00 | 60.0 | - | 2023-01-01 10:25:00 | 30.0 | - | 2023-01-01 10:30:00 | 180.0 | - | 2023-01-01 10:35:00 | 90.0 | - +---------------------+--------------+ - -Example 4: Calculate average number of packets by every 20 minutes and status -=========================================================== - -This example calculates the average number of packets for every 20 minutes and groups them by status. - -PPL query:: - - os> source=events | timechart span=20m avg(packets) by status - fetched rows / total rows = 8/8 - +---------------------+------------+--------------+ - | @timestamp | status | avg(packets) | - |---------------------+------------+--------------| - | 2023-01-01 10:00:00 | active | 30.0 | - | 2023-01-01 10:00:00 | inactive | 30.0 | - | 2023-01-01 10:00:00 | pending | 60.0 | - | 2023-01-01 10:00:00 | processing | 60.0 | - | 2023-01-01 10:20:00 | cancelled | 180.0 | - | 2023-01-01 10:20:00 | completed | 60.0 | - | 2023-01-01 10:20:00 | inactive | 90.0 | - | 2023-01-01 10:20:00 | pending | 30.0 | - +---------------------+------------+--------------+ - -Example 5: Count events by hour and category -===================================================================== - -This example counts events for each second and groups them by category - -PPL query:: - - os> source=events | timechart span=1h count() by category - fetched rows / total rows = 2/2 - +---------------------+----------+---------+ - | @timestamp | category | count() | - |---------------------+----------+---------| - | 2023-01-01 10:00:00 | orders | 4 | - | 2023-01-01 10:00:00 | users | 4 | - +---------------------+----------+---------+ - -Example 6: Using the limit parameter with count() function -========================================================== - -When there are many distinct values in the "by" field, the timechart command will display the top values based on the limit parameter and group the rest into an "OTHER" category. -This query will display the top 2 hosts with the highest count values, and group the remaining hosts into an "OTHER" category. 
- -PPL query:: - - os> source=events | timechart span=1m limit=2 count() by host - fetched rows / total rows = 8/8 - +---------------------+---------+---------+ - | @timestamp | host | count() | - |---------------------+---------+---------| - | 2023-01-01 10:00:00 | server1 | 1 | - | 2023-01-01 10:05:00 | server2 | 1 | - | 2023-01-01 10:10:00 | server1 | 1 | - | 2023-01-01 10:15:00 | server2 | 1 | - | 2023-01-01 10:20:00 | server1 | 1 | - | 2023-01-01 10:25:00 | server2 | 1 | - | 2023-01-01 10:30:00 | server1 | 1 | - | 2023-01-01 10:35:00 | server2 | 1 | - +---------------------+---------+---------+ - -Example 7: Using limit=0 with count() to show all values -======================================================== - -To display all distinct values without any limit, set limit=0: - -PPL query:: - - os> source=events_many_hosts | timechart span=1h limit=0 count() by host - fetched rows / total rows = 11/11 - +---------------------+--------+---------+ - | @timestamp | host | count() | - |---------------------+--------+---------| - | 2024-07-01 00:00:00 | web-01 | 1 | - | 2024-07-01 00:00:00 | web-02 | 1 | - | 2024-07-01 00:00:00 | web-03 | 1 | - | 2024-07-01 00:00:00 | web-04 | 1 | - | 2024-07-01 00:00:00 | web-05 | 1 | - | 2024-07-01 00:00:00 | web-06 | 1 | - | 2024-07-01 00:00:00 | web-07 | 1 | - | 2024-07-01 00:00:00 | web-08 | 1 | - | 2024-07-01 00:00:00 | web-09 | 1 | - | 2024-07-01 00:00:00 | web-10 | 1 | - | 2024-07-01 00:00:00 | web-11 | 1 | - +---------------------+--------+---------+ - -This shows all 11 hosts as separate rows without an "OTHER" category. - -Example 8: Using useother=false with count() function -===================================================== - -Limit to top 10 hosts without OTHER category (useother=false): - -PPL query:: - - os> source=events_many_hosts | timechart span=1h useother=false count() by host - fetched rows / total rows = 10/10 - +---------------------+--------+---------+ - | @timestamp | host | count() | - |---------------------+--------+---------| - | 2024-07-01 00:00:00 | web-01 | 1 | - | 2024-07-01 00:00:00 | web-02 | 1 | - | 2024-07-01 00:00:00 | web-03 | 1 | - | 2024-07-01 00:00:00 | web-04 | 1 | - | 2024-07-01 00:00:00 | web-05 | 1 | - | 2024-07-01 00:00:00 | web-06 | 1 | - | 2024-07-01 00:00:00 | web-07 | 1 | - | 2024-07-01 00:00:00 | web-08 | 1 | - | 2024-07-01 00:00:00 | web-09 | 1 | - | 2024-07-01 00:00:00 | web-10 | 1 | - +---------------------+--------+---------+ - -Example 9: Using limit with useother parameter and avg() function -================================================================= - -Limit to top 3 hosts with OTHER category (default useother=true): - -PPL query:: - - os> source=events_many_hosts | timechart span=1h limit=3 avg(cpu_usage) by host - fetched rows / total rows = 4/4 - +---------------------+--------+----------------+ - | @timestamp | host | avg(cpu_usage) | - |---------------------+--------+----------------| - | 2024-07-01 00:00:00 | OTHER | 41.3 | - | 2024-07-01 00:00:00 | web-03 | 55.3 | - | 2024-07-01 00:00:00 | web-07 | 48.6 | - | 2024-07-01 00:00:00 | web-09 | 67.8 | - +---------------------+--------+----------------+ - -Limit to top 3 hosts without OTHER category (useother=false): - -PPL query:: - - os> source=events_many_hosts | timechart span=1h limit=3 useother=false avg(cpu_usage) by host - fetched rows / total rows = 3/3 - +---------------------+--------+----------------+ - | @timestamp | host | avg(cpu_usage) | - |---------------------+--------+----------------| - | 2024-07-01 00:00:00 | web-03 
| 55.3           |
-    | 2024-07-01 00:00:00 | web-07 | 48.6           |
-    | 2024-07-01 00:00:00 | web-09 | 67.8           |
-    +---------------------+--------+----------------+
-
-Example 10: Handling null values in the "by" field
-==================================================
-
-This example shows how null values in the "by" field are treated as a separate category. The dataset events_null has 1 entry that does not have a host field.
-It is put into a separate "NULL" category because the defaults for ``usenull`` and ``nullstr`` are ``true`` and ``"NULL"`` respectively.
-
-PPL query::
-
-    os> source=events_null | timechart span=1h count() by host
-    fetched rows / total rows = 4/4
-    +---------------------+--------+---------+
-    | @timestamp          | host   | count() |
-    |---------------------+--------+---------|
-    | 2024-07-01 00:00:00 | NULL   | 1       |
-    | 2024-07-01 00:00:00 | db-01  | 1       |
-    | 2024-07-01 00:00:00 | web-01 | 2       |
-    | 2024-07-01 00:00:00 | web-02 | 2       |
-    +---------------------+--------+---------+
-
-Example 11: Calculate packets per second rate
-=============================================
-
-This example calculates the per-second packet rate for network traffic data using the per_second() function.
-
-PPL query::
-
-    os> source=events | timechart span=30m per_second(packets) by host
-    fetched rows / total rows = 4/4
-    +---------------------+---------+---------------------+
-    | @timestamp          | host    | per_second(packets) |
-    |---------------------+---------+---------------------|
-    | 2023-01-01 10:00:00 | server1 | 0.1                 |
-    | 2023-01-01 10:00:00 | server2 | 0.05                |
-    | 2023-01-01 10:30:00 | server1 | 0.1                 |
-    | 2023-01-01 10:30:00 | server2 | 0.05                |
-    +---------------------+---------+---------------------+
-
-Limitations
-===========
-* Only a single aggregation function is supported per timechart command.
-* The ``bins`` parameter and other bin options are not supported since the ``bin`` command is not implemented yet. Use the ``span`` parameter to control time intervals.
-
diff --git a/docs/user/ppl/cmd/top.md b/docs/user/ppl/cmd/top.md
new file mode 100644
index 00000000000..fa644f2a117
--- /dev/null
+++ b/docs/user/ppl/cmd/top.md
@@ -0,0 +1,164 @@
+# top
+
+## Description
+
+The `top` command finds the most common tuple of values of all fields in the field list.
+## Syntax
+
+top [N] [top-options] \<field-list\> [by-clause]
+* N: optional. number of results to return. **Default**: 10
+* top-options: optional. options for the top command. Supported syntax is [countfield=\<field\>] [showcount=\<bool\>].
+  * showcount=\<bool\>: optional. whether to create a field in the output that represents a count of the tuple of values. **Default:** true.
+  * countfield=\<field\>: optional. the name of the field that contains the count. **Default:** 'count'.
+  * usenull=\<bool\>: optional (since 3.4.0). whether to output the null value. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
+    * When `plugins.ppl.syntax.legacy.preferred=true`, `usenull` defaults to `true`
+    * When `plugins.ppl.syntax.legacy.preferred=false`, `usenull` defaults to `false`
+* field-list: mandatory. comma-delimited list of field names.
+* by-clause: optional. one or more fields to group the results by.
+
+## Example 1: Find the most common values in a field
+
+This example finds the most common gender of all the accounts.
+
+```ppl
+source=accounts
+| top showcount=false gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+
+| gender |
+|--------|
+| M      |
+| F      |
++--------+
+```
+
+## Example 2: Limit results to top N values
+
+This example finds the most common gender and limits results to 1 value.
+
+```ppl
+source=accounts
+| top 1 showcount=false gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| gender |
+|--------|
+| M      |
++--------+
+```
+
+## Example 3: Find the most common values grouped by field
+
+This example finds the most common age of all the accounts grouped by gender.
+
+```ppl
+source=accounts
+| top 1 showcount=false age by gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+-----+
+| gender | age |
+|--------+-----|
+| F      | 28  |
+| M      | 32  |
++--------+-----+
+```
+
+## Example 4: Top command with count field
+
+This example finds the most common gender of all the accounts and includes the count.
+
+```ppl
+source=accounts
+| top gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+-------+
+| gender | count |
+|--------+-------|
+| M      | 3     |
+| F      | 1     |
++--------+-------+
+```
+
+## Example 5: Specify the count field option
+
+This example specifies a custom name for the count field.
+
+```ppl
+source=accounts
+| top countfield='cnt' gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+-----+
+| gender | cnt |
+|--------+-----|
+| M      | 3   |
+| F      | 1   |
++--------+-----+
+```
+
+## Example 6: Specify the usenull field option
+
+This example shows how the `usenull` option controls whether documents with a null value in the field appear in the results.
+
+```ppl
+source=accounts
+| top usenull=false email
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++-----------------------+-------+
+| email                 | count |
+|-----------------------+-------|
+| amberduke@pyrami.com  | 1     |
+| daleadams@boink.com   | 1     |
+| hattiebond@netagy.com | 1     |
++-----------------------+-------+
+```
+
+```ppl
+source=accounts
+| top usenull=true email
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------------------+-------+
+| email                 | count |
+|-----------------------+-------|
+| null                  | 1     |
+| amberduke@pyrami.com  | 1     |
+| daleadams@boink.com   | 1     |
+| hattiebond@netagy.com | 1     |
++-----------------------+-------+
+```
+
+## Limitations
+
+The `top` command is not rewritten to OpenSearch DSL; it is only executed on the coordination node.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/top.rst b/docs/user/ppl/cmd/top.rst
deleted file mode 100644
index bdf22addf40..00000000000
--- a/docs/user/ppl/cmd/top.rst
+++ /dev/null
@@ -1,145 +0,0 @@
-===
-top
-===
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Description
-===========
-| The ``top`` command finds the most common tuple of values of all fields in the field list.
-
-Syntax
-======
-top [N] [top-options] <field-list> [by-clause]
-
-* N: optional. number of results to return. **Default**: 10
-* top-options: optional. options for the top command. Supported syntax is [countfield=<field>] [showcount=<bool>].
-
-  * showcount=<bool>: optional. whether to create a field in output that represent a count of the tuple of values. **Default:** true.
-  * countfield=<field>: optional. the name of the field that contains count. **Default:** 'count'.
-  * usenull=<bool>: optional (since 3.4.0). whether to output the null value. **Default:** Determined by ``plugins.ppl.syntax.legacy.preferred``.
- - * When ``plugins.ppl.syntax.legacy.preferred=true``, ``usenull`` defaults to ``true`` - * When ``plugins.ppl.syntax.legacy.preferred=false``, ``usenull`` defaults to ``false`` - -* field-list: mandatory. comma-delimited list of field names. -* by-clause: optional. one or more fields to group the results by. - -Example 1: Find the most common values in a field -================================================= - -This example finds the most common gender of all the accounts. - -PPL query:: - - os> source=accounts | top showcount=false gender; - fetched rows / total rows = 2/2 - +--------+ - | gender | - |--------| - | M | - | F | - +--------+ - -Example 2: Limit results to top N values -======================================== - -This example finds the most common gender and limits results to 1 value. - -PPL query:: - - os> source=accounts | top 1 showcount=false gender; - fetched rows / total rows = 1/1 - +--------+ - | gender | - |--------| - | M | - +--------+ - -Example 3: Find the most common values grouped by field -======================================================= - -This example finds the most common age of all the accounts grouped by gender. - -PPL query:: - - os> source=accounts | top 1 showcount=false age by gender; - fetched rows / total rows = 2/2 - +--------+-----+ - | gender | age | - |--------+-----| - | F | 28 | - | M | 32 | - +--------+-----+ - -Example 4: Top command with count field -======================================= - -This example finds the most common gender of all the accounts and includes the count. - -PPL query:: - - os> source=accounts | top gender; - fetched rows / total rows = 2/2 - +--------+-------+ - | gender | count | - |--------+-------| - | M | 3 | - | F | 1 | - +--------+-------+ - - -Example 5: Specify the count field option -========================================= - -This example specifies a custom name for the count field. - -PPL query:: - - os> source=accounts | top countfield='cnt' gender; - fetched rows / total rows = 2/2 - +--------+-----+ - | gender | cnt | - |--------+-----| - | M | 3 | - | F | 1 | - +--------+-----+ - - -Example 5: Specify the usenull field option -=========================================== - -PPL query:: - - os> source=accounts | top usenull=false email; - fetched rows / total rows = 3/3 - +-----------------------+-------+ - | email | count | - |-----------------------+-------| - | amberduke@pyrami.com | 1 | - | daleadams@boink.com | 1 | - | hattiebond@netagy.com | 1 | - +-----------------------+-------+ - -PPL query:: - - os> source=accounts | top usenull=true email; - fetched rows / total rows = 4/4 - +-----------------------+-------+ - | email | count | - |-----------------------+-------| - | null | 1 | - | amberduke@pyrami.com | 1 | - | daleadams@boink.com | 1 | - | hattiebond@netagy.com | 1 | - +-----------------------+-------+ - - -Limitations -=========== -The ``top`` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node. diff --git a/docs/user/ppl/cmd/trendline.md b/docs/user/ppl/cmd/trendline.md new file mode 100644 index 00000000000..8fee5d3c399 --- /dev/null +++ b/docs/user/ppl/cmd/trendline.md @@ -0,0 +1,114 @@ +# trendline + +## Description + +The `trendline` command calculates moving averages of fields. +## Syntax + +trendline [sort <[+\|-] sort-field>] [sma\|wma](number-of-datapoints, field) [as \] [[sma\|wma](number-of-datapoints, field) [as \]]... +* [+\|-]: optional. 
The plus [+] stands for ascending order and NULL/MISSING first and a minus [-] stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first. +* sort-field: mandatory when sorting is used. The field used to sort. +* sma\|wma: mandatory. Simple Moving Average (sma) applies equal weighting to all values, Weighted Moving Average (wma) applies greater weight to more recent values. +* number-of-datapoints: mandatory. The number of datapoints to calculate the moving average (must be greater than zero). +* field: mandatory. The name of the field the moving average should be calculated for. +* alias: optional. The name of the resulting column containing the moving average. **Default:** field name with "_trendline". + +## Example 1: Calculate the simple moving average on one field. + +This example shows how to calculate the simple moving average on one field. + +```ppl +source=accounts +| trendline sma(2, account_number) as an +| fields an +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++------+ +| an | +|------| +| null | +| 3.5 | +| 9.5 | +| 15.5 | ++------+ +``` + +## Example 2: Calculate the simple moving average on multiple fields. + +This example shows how to calculate the simple moving average on multiple fields. + +```ppl +source=accounts +| trendline sma(2, account_number) as an sma(2, age) as age_trend +| fields an, age_trend +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++------+-----------+ +| an | age_trend | +|------+-----------| +| null | null | +| 3.5 | 34.0 | +| 9.5 | 32.0 | +| 15.5 | 30.5 | ++------+-----------+ +``` + +## Example 3: Calculate the simple moving average on one field without specifying an alias. + +This example shows how to calculate the simple moving average on one field. + +```ppl +source=accounts +| trendline sma(2, account_number) +| fields account_number_trendline +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------------------------+ +| account_number_trendline | +|--------------------------| +| null | +| 3.5 | +| 9.5 | +| 15.5 | ++--------------------------+ +``` + +## Example 4: Calculate the weighted moving average on one field. + +This example shows how to calculate the weighted moving average on one field. + +```ppl +source=accounts +| trendline wma(2, account_number) +| fields account_number_trendline +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------------------------+ +| account_number_trendline | +|--------------------------| +| null | +| 4.333333333333333 | +| 10.666666666666666 | +| 16.333333333333332 | ++--------------------------+ +``` + +## Limitations + +The `trendline` command requires all values in the specified `field` to be non-null. Any rows with null values present in the calculation field will be automatically excluded from the command's output. \ No newline at end of file diff --git a/docs/user/ppl/cmd/trendline.rst b/docs/user/ppl/cmd/trendline.rst deleted file mode 100644 index e2fd067d262..00000000000 --- a/docs/user/ppl/cmd/trendline.rst +++ /dev/null @@ -1,103 +0,0 @@ -========= -trendline -========= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``trendline`` command calculates moving averages of fields. - -Syntax -====== -trendline [sort <[+|-] sort-field>] [sma|wma](number-of-datapoints, field) [as ] [[sma|wma](number-of-datapoints, field) [as ]]... - -* [+|-]: optional. 
The plus [+] stands for ascending order and NULL/MISSING first and a minus [-] stands for descending order and NULL/MISSING last. **Default:** ascending order and NULL/MISSING first.
-* sort-field: mandatory when sorting is used. The field used to sort.
-* sma|wma: mandatory. Simple Moving Average (sma) applies equal weighting to all values, Weighted Moving Average (wma) applies greater weight to more recent values.
-* number-of-datapoints: mandatory. The number of datapoints to calculate the moving average (must be greater than zero).
-* field: mandatory. The name of the field the moving average should be calculated for.
-* alias: optional. The name of the resulting column containing the moving average. **Default:** field name with "_trendline".
-
-
-Example 1: Calculate the simple moving average on one field.
-============================================================
-
-This example shows how to calculate the simple moving average on one field.
-
-PPL query::
-
-    os> source=accounts | trendline sma(2, account_number) as an | fields an;
-    fetched rows / total rows = 4/4
-    +------+
-    | an   |
-    |------|
-    | null |
-    | 3.5  |
-    | 9.5  |
-    | 15.5 |
-    +------+
-
-
-Example 2: Calculate the simple moving average on multiple fields.
-==================================================================
-
-This example shows how to calculate the simple moving average on multiple fields.
-
-PPL query::
-
-    os> source=accounts | trendline sma(2, account_number) as an sma(2, age) as age_trend | fields an, age_trend ;
-    fetched rows / total rows = 4/4
-    +------+-----------+
-    | an   | age_trend |
-    |------+-----------|
-    | null | null      |
-    | 3.5  | 34.0      |
-    | 9.5  | 32.0      |
-    | 15.5 | 30.5      |
-    +------+-----------+
-
-Example 3: Calculate the simple moving average on one field without specifying an alias.
-========================================================================================
-
-This example shows how to calculate the simple moving average on one field.
-
-PPL query::
-
-    os> source=accounts | trendline sma(2, account_number) | fields account_number_trendline;
-    fetched rows / total rows = 4/4
-    +--------------------------+
-    | account_number_trendline |
-    |--------------------------|
-    | null                     |
-    | 3.5                      |
-    | 9.5                      |
-    | 15.5                     |
-    +--------------------------+
-
-Example 4: Calculate the weighted moving average on one field.
-==============================================================
-
-This example shows how to calculate the weighted moving average on one field.
-
-PPL query::
-
-    PPL> source=accounts | trendline wma(2, account_number) | fields account_number_trendline;
-    fetched rows / total rows = 4/4
-    +--------------------------+
-    | account_number_trendline |
-    |--------------------------|
-    | null                     |
-    | 4.333333333333333        |
-    | 10.666666666666666       |
-    | 16.333333333333332       |
-    +--------------------------+
-
-Limitations
-===========
-The ``trendline`` command requires all values in the specified ``field`` to be non-null. Any rows with null values present in the calculation field will be automatically excluded from the command's output.
\ No newline at end of file
diff --git a/docs/user/ppl/cmd/where.md b/docs/user/ppl/cmd/where.md
new file mode 100644
index 00000000000..9a96d9b7d47
--- /dev/null
+++ b/docs/user/ppl/cmd/where.md
@@ -0,0 +1,207 @@
+# where
+
+## Description
+
+The `where` command filters the search result. The `where` command only returns the result when the bool-expression evaluates to true.
+## Syntax
+
+where \<bool-expression\>
+* bool-expression: optional.
Any expression which could be evaluated to boolean value. + +## Example 1: Filter result set with condition + +This example shows fetching all the documents from the accounts index where account_number is 1 or gender is "F". + +```ppl +source=accounts +| where account_number=1 or gender="F" +| fields account_number, gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------+--------+ +| account_number | gender | +|----------------+--------| +| 1 | M | +| 13 | F | ++----------------+--------+ +``` + +## Example 2: Basic Field Comparison + +The example shows how to filter accounts with balance greater than 30000. + +```ppl +source=accounts +| where balance > 30000 +| fields account_number, balance +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------+---------+ +| account_number | balance | +|----------------+---------| +| 1 | 39225 | +| 13 | 32838 | ++----------------+---------+ +``` + +## Example 3: Pattern Matching with LIKE + +Pattern Matching with Underscore (_) +The example demonstrates using LIKE with underscore (_) to match a single character. + +```ppl +source=accounts +| where LIKE(state, 'M_') +| fields account_number, state +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+-------+ +| account_number | state | +|----------------+-------| +| 18 | MD | ++----------------+-------+ +``` + +Pattern Matching with Percent (%) +The example demonstrates using LIKE with percent (%) to match multiple characters. + +```ppl +source=accounts +| where LIKE(state, 'V%') +| fields account_number, state +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+-------+ +| account_number | state | +|----------------+-------| +| 13 | VA | ++----------------+-------+ +``` + +## Example 4: Multiple Conditions + +The example shows how to combine multiple conditions using AND operator. + +```ppl +source=accounts +| where age > 30 AND gender = 'M' +| fields account_number, age, gender +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++----------------+-----+--------+ +| account_number | age | gender | +|----------------+-----+--------| +| 1 | 32 | M | +| 6 | 36 | M | +| 18 | 33 | M | ++----------------+-----+--------+ +``` + +## Example 5: Using IN Operator + +The example demonstrates using IN operator to match multiple values. + +```ppl +source=accounts +| where state IN ('IL', 'VA') +| fields account_number, state +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------------+-------+ +| account_number | state | +|----------------+-------| +| 1 | IL | +| 13 | VA | ++----------------+-------+ +``` + +## Example 6: NULL Checks + +The example shows how to filter records with NULL values. + +```ppl +source=accounts +| where ISNULL(employer) +| fields account_number, employer +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+----------+ +| account_number | employer | +|----------------+----------| +| 18 | null | ++----------------+----------+ +``` + +## Example 7: Complex Conditions + +The example demonstrates combining multiple conditions with parentheses and logical operators. 
+ +```ppl +source=accounts +| where (balance > 40000 OR age > 35) AND gender = 'M' +| fields account_number, balance, age, gender +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+---------+-----+--------+ +| account_number | balance | age | gender | +|----------------+---------+-----+--------| +| 6 | 5686 | 36 | M | ++----------------+---------+-----+--------+ +``` + +## Example 8: NOT Conditions + +The example shows how to use NOT operator to exclude matching records. + +```ppl +source=accounts +| where NOT state = 'CA' +| fields account_number, state +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++----------------+-------+ +| account_number | state | +|----------------+-------| +| 1 | IL | +| 6 | TN | +| 13 | VA | +| 18 | MD | ++----------------+-------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/cmd/where.rst b/docs/user/ppl/cmd/where.rst deleted file mode 100644 index 324af4dcb54..00000000000 --- a/docs/user/ppl/cmd/where.rst +++ /dev/null @@ -1,165 +0,0 @@ -===== -where -===== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -=========== -| The ``where`` command filters the search result. The ``where`` command only returns the result when the bool-expression evaluates to true. - -Syntax -====== -where - -* bool-expression: optional. Any expression which could be evaluated to boolean value. - -Example 1: Filter result set with condition -=========================================== - -This example shows fetching all the documents from the accounts index where account_number is 1 or gender is "F". - -PPL query:: - - os> source=accounts | where account_number=1 or gender="F" | fields account_number, gender; - fetched rows / total rows = 2/2 - +----------------+--------+ - | account_number | gender | - |----------------+--------| - | 1 | M | - | 13 | F | - +----------------+--------+ - -Example 2: Basic Field Comparison -================================= - -The example shows how to filter accounts with balance greater than 30000. - -PPL query:: - - os> source=accounts | where balance > 30000 | fields account_number, balance; - fetched rows / total rows = 2/2 - +----------------+---------+ - | account_number | balance | - |----------------+---------| - | 1 | 39225 | - | 13 | 32838 | - +----------------+---------+ - -Example 3: Pattern Matching with LIKE -===================================== - -Pattern Matching with Underscore (_) - -The example demonstrates using LIKE with underscore (_) to match a single character. - -PPL query:: - - os> source=accounts | where LIKE(state, 'M_') | fields account_number, state; - fetched rows / total rows = 1/1 - +----------------+-------+ - | account_number | state | - |----------------+-------| - | 18 | MD | - +----------------+-------+ - -Pattern Matching with Percent (%) - -The example demonstrates using LIKE with percent (%) to match multiple characters. - -PPL query:: - - os> source=accounts | where LIKE(state, 'V%') | fields account_number, state; - fetched rows / total rows = 1/1 - +----------------+-------+ - | account_number | state | - |----------------+-------| - | 13 | VA | - +----------------+-------+ - -Example 4: Multiple Conditions -============================== - -The example shows how to combine multiple conditions using AND operator. 
- -PPL query:: - - os> source=accounts | where age > 30 AND gender = 'M' | fields account_number, age, gender; - fetched rows / total rows = 3/3 - +----------------+-----+--------+ - | account_number | age | gender | - |----------------+-----+--------| - | 1 | 32 | M | - | 6 | 36 | M | - | 18 | 33 | M | - +----------------+-----+--------+ - -Example 5: Using IN Operator -============================ - -The example demonstrates using IN operator to match multiple values. - -PPL query:: - - os> source=accounts | where state IN ('IL', 'VA') | fields account_number, state; - fetched rows / total rows = 2/2 - +----------------+-------+ - | account_number | state | - |----------------+-------| - | 1 | IL | - | 13 | VA | - +----------------+-------+ - -Example 6: NULL Checks -====================== - -The example shows how to filter records with NULL values. - -PPL query:: - - os> source=accounts | where ISNULL(employer) | fields account_number, employer; - fetched rows / total rows = 1/1 - +----------------+----------+ - | account_number | employer | - |----------------+----------| - | 18 | null | - +----------------+----------+ - -Example 7: Complex Conditions -============================= - -The example demonstrates combining multiple conditions with parentheses and logical operators. - -PPL query:: - - os> source=accounts | where (balance > 40000 OR age > 35) AND gender = 'M' | fields account_number, balance, age, gender; - fetched rows / total rows = 1/1 - +----------------+---------+-----+--------+ - | account_number | balance | age | gender | - |----------------+---------+-----+--------| - | 6 | 5686 | 36 | M | - +----------------+---------+-----+--------+ - -Example 8: NOT Conditions -========================= - -The example shows how to use NOT operator to exclude matching records. - -PPL query:: - - os> source=accounts | where NOT state = 'CA' | fields account_number, state; - fetched rows / total rows = 4/4 - +----------------+-------+ - | account_number | state | - |----------------+-------| - | 1 | IL | - | 6 | TN | - | 13 | VA | - | 18 | MD | - +----------------+-------+ - diff --git a/docs/user/ppl/functions/aggregations.md b/docs/user/ppl/functions/aggregations.md new file mode 100644 index 00000000000..c11a7687cb8 --- /dev/null +++ b/docs/user/ppl/functions/aggregations.md @@ -0,0 +1,653 @@ +# Aggregation Functions + +## Description + +Aggregation functions perform calculations across multiple rows to return a single result value. These functions are used with `stats` and `eventstats` commands to analyze and summarize data. +The following table shows how NULL/MISSING values are handled by aggregation functions: + +| Function | NULL | MISSING | +| --- | --- | --- | +| COUNT | Not counted | Not counted | +| SUM | Ignore | Ignore | +| AVG | Ignore | Ignore | +| MAX | Ignore | Ignore | +| MIN | Ignore | Ignore | +| FIRST | Ignore | Ignore | +| LAST | Ignore | Ignore | +| LIST | Ignore | Ignore | +| VALUES | Ignore | Ignore | + +## Functions + +### COUNT + +#### Description + +Usage: Returns a count of the number of expr in the rows retrieved. The `C()` function, `c`, and `count` can be used as abbreviations for `COUNT()`. To perform a filtered counting, wrap the condition to satisfy in an `eval` expression. 
+Example + +```ppl +source=accounts +| stats count(), c(), count, c +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+-----+-------+---+ +| count() | c() | count | c | +|---------+-----+-------+---| +| 4 | 4 | 4 | 4 | ++---------+-----+-------+---+ +``` + +Example of filtered counting + +```ppl +source=accounts +| stats count(eval(age > 30)) as mature_users +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| mature_users | +|--------------| +| 3 | ++--------------+ +``` + +### SUM + +#### Description + +Usage: SUM(expr). Returns the sum of expr. +Example + +```ppl +source=accounts +| stats sum(age) by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++----------+--------+ +| sum(age) | gender | +|----------+--------| +| 28 | F | +| 101 | M | ++----------+--------+ +``` + +### AVG + +#### Description + +Usage: AVG(expr). Returns the average value of expr. +Example + +```ppl +source=accounts +| stats avg(age) by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++--------------------+--------+ +| avg(age) | gender | +|--------------------+--------| +| 28.0 | F | +| 33.666666666666664 | M | ++--------------------+--------+ +``` + +### MAX + +#### Description + +Usage: MAX(expr). Returns the maximum value of expr. +For non-numeric fields, values are sorted lexicographically. +Example + +```ppl +source=accounts +| stats max(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------+ +| max(age) | +|----------| +| 36 | ++----------+ +``` + +Example with text field + +```ppl +source=accounts +| stats max(firstname) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| max(firstname) | +|----------------| +| Nanette | ++----------------+ +``` + +### MIN + +#### Description + +Usage: MIN(expr). Returns the minimum value of expr. +For non-numeric fields, values are sorted lexicographically. +Example + +```ppl +source=accounts +| stats min(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------+ +| min(age) | +|----------| +| 28 | ++----------+ +``` + +Example with text field + +```ppl +source=accounts +| stats min(firstname) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| min(firstname) | +|----------------| +| Amber | ++----------------+ +``` + +### VAR_SAMP + +#### Description + +Usage: VAR_SAMP(expr). Returns the sample variance of expr. +Example + +```ppl +source=accounts +| stats var_samp(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+ +| var_samp(age) | +|--------------------| +| 10.916666666666666 | ++--------------------+ +``` + +### VAR_POP + +#### Description + +Usage: VAR_POP(expr). Returns the population standard variance of expr. +Example + +```ppl +source=accounts +| stats var_pop(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| var_pop(age) | +|--------------| +| 8.1875 | ++--------------+ +``` + +### STDDEV_SAMP + +#### Description + +Usage: STDDEV_SAMP(expr). Return the sample standard deviation of expr. +Example + +```ppl +source=accounts +| stats stddev_samp(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------+ +| stddev_samp(age) | +|-------------------| +| 3.304037933599835 | ++-------------------+ +``` + +### STDDEV_POP + +#### Description + +Usage: STDDEV_POP(expr). 
Return the population standard deviation of expr. +Example + +```ppl +source=accounts +| stats stddev_pop(age) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+ +| stddev_pop(age) | +|--------------------| +| 2.8613807855648994 | ++--------------------+ +``` + +### DISTINCT_COUNT, DC + +#### Description + +Usage: DISTINCT_COUNT(expr), DC(expr). Returns the approximate number of distinct values using the HyperLogLog++ algorithm. Both functions are equivalent. +For details on algorithm accuracy and precision control, see the [OpenSearch Cardinality Aggregation documentation](https://docs.opensearch.org/latest/aggregations/metric/cardinality/#controlling-precision). +Example + +```ppl +source=accounts +| stats dc(state) as distinct_states, distinct_count(state) as dc_states_alt by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----------------+---------------+--------+ +| distinct_states | dc_states_alt | gender | +|-----------------+---------------+--------| +| 1 | 1 | F | +| 3 | 3 | M | ++-----------------+---------------+--------+ +``` + +### DISTINCT_COUNT_APPROX + +#### Description + +Usage: DISTINCT_COUNT_APPROX(expr). Return the approximate distinct count value of the expr, using the hyperloglog++ algorithm. +Example + +```ppl +source=accounts +| stats distinct_count_approx(gender) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------------+ +| distinct_count_approx(gender) | +|-------------------------------| +| 2 | ++-------------------------------+ +``` + +### EARLIEST + +#### Description + +Usage: EARLIEST(field [, time_field]). Return the earliest value of a field based on timestamp ordering. +* field: mandatory. The field to return the earliest value for. +* time_field: optional. The field to use for time-based ordering. Defaults to @timestamp if not specified. + +Example + +```ppl +source=events +| stats earliest(message) by host +| sort host +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-------------------+---------+ +| earliest(message) | host | +|-------------------+---------| +| Starting up | server1 | +| Initializing | server2 | ++-------------------+---------+ +``` + +Example with custom time field + +```ppl +source=events +| stats earliest(status, event_time) by category +| sort category +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++------------------------------+----------+ +| earliest(status, event_time) | category | +|------------------------------+----------| +| pending | orders | +| active | users | ++------------------------------+----------+ +``` + +### LATEST + +#### Description + +Usage: LATEST(field [, time_field]). Return the latest value of a field based on timestamp ordering. +* field: mandatory. The field to return the latest value for. +* time_field: optional. The field to use for time-based ordering. Defaults to @timestamp if not specified. 
+
+Example
+
+```ppl
+source=events
+| stats latest(message) by host
+| sort host
+```
+
+Expected output:

+```text
+fetched rows / total rows = 2/2
++------------------+---------+
+| latest(message)  | host    |
+|------------------+---------|
+| Shutting down    | server1 |
+| Maintenance mode | server2 |
++------------------+---------+
+```
+
+Example with custom time field
+
+```ppl
+source=events
+| stats latest(status, event_time) by category
+| sort category
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------------------------+----------+
+| latest(status, event_time) | category |
+|----------------------------+----------|
+| cancelled                  | orders   |
+| inactive                   | users    |
++----------------------------+----------+
+```
+
+### TAKE
+
+#### Description
+
+Usage: TAKE(field [, size]). Returns the original values of a field. The order of the returned values is not guaranteed.
+* field: mandatory. The field must be a text field.
+* size: optional integer. The number of values to return. Default is 10.
+
+Example
+
+```ppl
+source=accounts
+| stats take(firstname)
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+
+| take(firstname)             |
+|-----------------------------|
+| [Amber,Hattie,Nanette,Dale] |
++-----------------------------+
+```
+
+### PERCENTILE or PERCENTILE_APPROX
+
+#### Description
+
+Usage: PERCENTILE(expr, percent) or PERCENTILE_APPROX(expr, percent). Return the approximate percentile value of expr at the specified percentage.
+* percent: The number must be a constant between 0 and 100.
+
+Note: From 3.1.0, the percentile implementation switched from AVLTreeDigest to MergingDigest. Ref [issue link](https://github.com/opensearch-project/OpenSearch/issues/18122).
+Example
+
+```ppl
+source=accounts
+| stats percentile(age, 90) by gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++---------------------+--------+
+| percentile(age, 90) | gender |
+|---------------------+--------|
+| 28                  | F      |
+| 36                  | M      |
++---------------------+--------+
+```
+
+#### Percentile Shortcut Functions
+
+For convenience, OpenSearch PPL provides shortcut functions for common percentiles:
+- `PERC<percent>(expr)` - Equivalent to `PERCENTILE(expr, <percent>)`
+- `P<percent>(expr)` - Equivalent to `PERCENTILE(expr, <percent>)`
+
+Both integer and decimal percentiles from 0 to 100 are supported (e.g., `PERC95`, `P99.5`).
+
+```ppl
+source=accounts
+| stats perc99.5(age)
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------+
+| perc99.5(age) |
+|---------------|
+| 36            |
++---------------+
+```
+
+```ppl
+source=accounts
+| stats p50(age)
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------+
+| p50(age) |
+|----------|
+| 33       |
++----------+
+```
+
+### MEDIAN
+
+#### Description
+
+Usage: MEDIAN(expr). Returns the median (50th percentile) value of `expr`. This is equivalent to `PERCENTILE(expr, 50)`.
+Example
+
+```ppl
+source=accounts
+| stats median(age)
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------+
+| median(age) |
+|-------------|
+| 33          |
++-------------+
+```
+
+### FIRST
+
+#### Description
+
+Usage: FIRST(field). Return the first non-null value of a field based on natural document order. Returns NULL if no records exist, or if all records have NULL values for the field.
+* field: mandatory. The field to return the first value for.
+ +Example + +```ppl +source=accounts +| stats first(firstname) by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++------------------+--------+ +| first(firstname) | gender | +|------------------+--------| +| Nanette | F | +| Amber | M | ++------------------+--------+ +``` + +### LAST + +#### Description + +Usage: LAST(field). Return the last non-null value of a field based on natural document order. Returns NULL if no records exist, or if all records have NULL values for the field. +* field: mandatory. The field to return the last value for. + +Example + +```ppl +source=accounts +| stats last(firstname) by gender +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----------------+--------+ +| last(firstname) | gender | +|-----------------+--------| +| Nanette | F | +| Dale | M | ++-----------------+--------+ +``` + +### LIST + +#### Description + +Usage: LIST(expr). Collects all values from the specified expression into an array. Values are converted to strings, nulls are filtered, and duplicates are preserved. +The function returns up to 100 values with no guaranteed ordering. +* expr: The field expression to collect values from. +* This aggregation function doesn't support Array, Struct, Object field types. + +Example with string fields + +```ppl +source=accounts +| stats list(firstname) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------------+ +| list(firstname) | +|-----------------------------| +| [Amber,Hattie,Nanette,Dale] | ++-----------------------------+ +``` + +### VALUES + +#### Description + +Usage: VALUES(expr). Collects all unique values from the specified expression into a sorted array. Values are converted to strings, nulls are filtered, and duplicates are removed. +The maximum number of unique values returned is controlled by the `plugins.ppl.values.max.limit` setting: +* Default value is 0, which means unlimited values are returned +* Can be configured to any positive integer to limit the number of unique values +* See the [PPL Settings](../admin/settings.md#plugins-ppl-values-max-limit) documentation for more details + +Example with string fields + +```ppl +source=accounts +| stats values(firstname) +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------------+ +| values(firstname) | +|-----------------------------| +| [Amber,Dale,Hattie,Nanette] | ++-----------------------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/aggregations.rst b/docs/user/ppl/functions/aggregations.rst deleted file mode 100644 index 6605bda0765..00000000000 --- a/docs/user/ppl/functions/aggregations.rst +++ /dev/null @@ -1,522 +0,0 @@ -===================== -Aggregation Functions -===================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Description -============ -| Aggregation functions perform calculations across multiple rows to return a single result value. These functions are used with ``stats`` and ``eventstats`` commands to analyze and summarize data. 
- -| The following table shows how NULL/MISSING values are handled by aggregation functions: - -+----------+-------------+-------------+ -| Function | NULL | MISSING | -+----------+-------------+-------------+ -| COUNT | Not counted | Not counted | -+----------+-------------+-------------+ -| SUM | Ignore | Ignore | -+----------+-------------+-------------+ -| AVG | Ignore | Ignore | -+----------+-------------+-------------+ -| MAX | Ignore | Ignore | -+----------+-------------+-------------+ -| MIN | Ignore | Ignore | -+----------+-------------+-------------+ -| FIRST | Ignore | Ignore | -+----------+-------------+-------------+ -| LAST | Ignore | Ignore | -+----------+-------------+-------------+ -| LIST | Ignore | Ignore | -+----------+-------------+-------------+ -| VALUES | Ignore | Ignore | -+----------+-------------+-------------+ - -Functions -========= - -COUNT ------ - -Description ->>>>>>>>>>> - -Usage: Returns a count of the number of expr in the rows retrieved. The ``C()`` function, ``c``, and ``count`` can be used as abbreviations for ``COUNT()``. To perform a filtered counting, wrap the condition to satisfy in an `eval` expression. - -Example:: - - os> source=accounts | stats count(), c(), count, c; - fetched rows / total rows = 1/1 - +---------+-----+-------+---+ - | count() | c() | count | c | - |---------+-----+-------+---| - | 4 | 4 | 4 | 4 | - +---------+-----+-------+---+ - -Example of filtered counting:: - - os> source=accounts | stats count(eval(age > 30)) as mature_users; - fetched rows / total rows = 1/1 - +--------------+ - | mature_users | - |--------------| - | 3 | - +--------------+ - -SUM ---- - -Description ->>>>>>>>>>> - -Usage: SUM(expr). Returns the sum of expr. - -Example:: - - os> source=accounts | stats sum(age) by gender; - fetched rows / total rows = 2/2 - +----------+--------+ - | sum(age) | gender | - |----------+--------| - | 28 | F | - | 101 | M | - +----------+--------+ - -AVG ---- - -Description ->>>>>>>>>>> - -Usage: AVG(expr). Returns the average value of expr. - -Example:: - - os> source=accounts | stats avg(age) by gender; - fetched rows / total rows = 2/2 - +--------------------+--------+ - | avg(age) | gender | - |--------------------+--------| - | 28.0 | F | - | 33.666666666666664 | M | - +--------------------+--------+ - -MAX ---- - -Description ->>>>>>>>>>> - -Usage: MAX(expr). Returns the maximum value of expr. - -For non-numeric fields, values are sorted lexicographically. - -Example:: - - os> source=accounts | stats max(age); - fetched rows / total rows = 1/1 - +----------+ - | max(age) | - |----------| - | 36 | - +----------+ - -Example with text field:: - - os> source=accounts | stats max(firstname); - fetched rows / total rows = 1/1 - +----------------+ - | max(firstname) | - |----------------| - | Nanette | - +----------------+ - -MIN ---- - -Description ->>>>>>>>>>> - -Usage: MIN(expr). Returns the minimum value of expr. - -For non-numeric fields, values are sorted lexicographically. - -Example:: - - os> source=accounts | stats min(age); - fetched rows / total rows = 1/1 - +----------+ - | min(age) | - |----------| - | 28 | - +----------+ - -Example with text field:: - - os> source=accounts | stats min(firstname); - fetched rows / total rows = 1/1 - +----------------+ - | min(firstname) | - |----------------| - | Amber | - +----------------+ - -VAR_SAMP --------- - -Description ->>>>>>>>>>> - -Usage: VAR_SAMP(expr). Returns the sample variance of expr. 
- -Example:: - - os> source=accounts | stats var_samp(age); - fetched rows / total rows = 1/1 - +--------------------+ - | var_samp(age) | - |--------------------| - | 10.916666666666666 | - +--------------------+ - -VAR_POP -------- - -Description ->>>>>>>>>>> - -Usage: VAR_POP(expr). Returns the population standard variance of expr. - -Example:: - - os> source=accounts | stats var_pop(age); - fetched rows / total rows = 1/1 - +--------------+ - | var_pop(age) | - |--------------| - | 8.1875 | - +--------------+ - -STDDEV_SAMP ------------ - -Description ->>>>>>>>>>> - -Usage: STDDEV_SAMP(expr). Return the sample standard deviation of expr. - -Example:: - - os> source=accounts | stats stddev_samp(age); - fetched rows / total rows = 1/1 - +-------------------+ - | stddev_samp(age) | - |-------------------| - | 3.304037933599835 | - +-------------------+ - -STDDEV_POP ----------- - -Description ->>>>>>>>>>> - -Usage: STDDEV_POP(expr). Return the population standard deviation of expr. - -Example:: - - os> source=accounts | stats stddev_pop(age); - fetched rows / total rows = 1/1 - +--------------------+ - | stddev_pop(age) | - |--------------------| - | 2.8613807855648994 | - +--------------------+ - -DISTINCT_COUNT, DC ------------------- - -Description ->>>>>>>>>>> - -Usage: DISTINCT_COUNT(expr), DC(expr). Returns the approximate number of distinct values using the HyperLogLog++ algorithm. Both functions are equivalent. - -For details on algorithm accuracy and precision control, see the `OpenSearch Cardinality Aggregation documentation `_. - -Example:: - - os> source=accounts | stats dc(state) as distinct_states, distinct_count(state) as dc_states_alt by gender; - fetched rows / total rows = 4/4 - +-----------------+---------------+--------+ - | distinct_states | dc_states_alt | gender | - |-----------------+---------------+--------| - | 3 | 3 | M | - | 1 | 1 | F | - +-----------------+---------------+--------| - -DISTINCT_COUNT_APPROX ---------------------- - -Description ->>>>>>>>>>> - -Usage: DISTINCT_COUNT_APPROX(expr). Return the approximate distinct count value of the expr, using the hyperloglog++ algorithm. - -Example:: - - PPL> source=accounts | stats distinct_count_approx(gender); - fetched rows / total rows = 1/1 - +-------------------------------+ - | distinct_count_approx(gender) | - |-------------------------------| - | 2 | - +-------------------------------+ - -EARLIEST --------- - -Description ->>>>>>>>>>> - -Usage: EARLIEST(field [, time_field]). Return the earliest value of a field based on timestamp ordering. - -* field: mandatory. The field to return the earliest value for. -* time_field: optional. The field to use for time-based ordering. Defaults to @timestamp if not specified. - -Example:: - - os> source=events | stats earliest(message) by host | sort host; - fetched rows / total rows = 2/2 - +-------------------+---------+ - | earliest(message) | host | - |-------------------+---------| - | Starting up | server1 | - | Initializing | server2 | - +-------------------+---------+ - -Example with custom time field:: - - os> source=events | stats earliest(status, event_time) by category | sort category; - fetched rows / total rows = 2/2 - +------------------------------+----------+ - | earliest(status, event_time) | category | - |------------------------------+----------| - | pending | orders | - | active | users | - +------------------------------+----------+ - -LATEST ------- - -Description ->>>>>>>>>>> - -Usage: LATEST(field [, time_field]). 
Return the latest value of a field based on timestamp ordering. - -* field: mandatory. The field to return the latest value for. -* time_field: optional. The field to use for time-based ordering. Defaults to @timestamp if not specified. - -Example:: - - os> source=events | stats latest(message) by host | sort host; - fetched rows / total rows = 2/2 - +------------------+---------+ - | latest(message) | host | - |------------------+---------| - | Shutting down | server1 | - | Maintenance mode | server2 | - +------------------+---------+ - -Example with custom time field:: - - os> source=events | stats latest(status, event_time) by category | sort category; - fetched rows / total rows = 2/2 - +----------------------------+----------+ - | latest(status, event_time) | category | - |----------------------------+----------| - | cancelled | orders | - | inactive | users | - +----------------------------+----------+ - -TAKE ----- - -Description ->>>>>>>>>>> - -Usage: TAKE(field [, size]). Return original values of a field. It does not guarantee on the order of values. - -* field: mandatory. The field must be a text field. -* size: optional integer. The number of values should be returned. Default is 10. - -Example:: - - os> source=accounts | stats take(firstname); - fetched rows / total rows = 1/1 - +-----------------------------+ - | take(firstname) | - |-----------------------------| - | [Amber,Hattie,Nanette,Dale] | - +-----------------------------+ - -PERCENTILE or PERCENTILE_APPROX -------------------------------- - -Description ->>>>>>>>>>> - -Usage: PERCENTILE(expr, percent) or PERCENTILE_APPROX(expr, percent). Return the approximate percentile value of expr at the specified percentage. - -* percent: The number must be a constant between 0 and 100. - -Note: From 3.1.0, the percentile implementation is switched to MergingDigest from AVLTreeDigest. Ref `issue link `_. - -Example:: - - os> source=accounts | stats percentile(age, 90) by gender; - fetched rows / total rows = 2/2 - +---------------------+--------+ - | percentile(age, 90) | gender | - |---------------------+--------| - | 28 | F | - | 36 | M | - +---------------------+--------+ - -Percentile Shortcut Functions ->>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -For convenience, OpenSearch PPL provides shortcut functions for common percentiles: - -- ``PERC(expr)`` - Equivalent to ``PERCENTILE(expr, )`` -- ``P(expr)`` - Equivalent to ``PERCENTILE(expr, )`` - -Both integer and decimal percentiles from 0 to 100 are supported (e.g., ``PERC95``, ``P99.5``). - -Example:: - - ppl> source=accounts | stats perc99.5(age); - fetched rows / total rows = 1/1 - +---------------+ - | perc99.5(age) | - |---------------| - | 36 | - +---------------+ - - ppl> source=accounts | stats p50(age); - fetched rows / total rows = 1/1 - +---------+ - | p50(age) | - |---------| - | 32 | - +---------+ - -MEDIAN ------- - -Description ->>>>>>>>>>> - -Usage: MEDIAN(expr). Returns the median (50th percentile) value of `expr`. This is equivalent to ``PERCENTILE(expr, 50)``. - -Example:: - - os> source=accounts | stats median(age); - fetched rows / total rows = 1/1 - +-------------+ - | median(age) | - |-------------| - | 33 | - +-------------+ - -FIRST ------ - -Description ->>>>>>>>>>> - -Usage: FIRST(field). Return the first non-null value of a field based on natural document order. Returns NULL if no records exist, or if all records have NULL values for the field. - -* field: mandatory. The field to return the first value for. 
- -Example:: - - os> source=accounts | stats first(firstname) by gender; - fetched rows / total rows = 2/2 - +------------------+--------+ - | first(firstname) | gender | - |------------------+--------| - | Nanette | F | - | Amber | M | - +------------------+--------+ - -LAST ----- - -Description ->>>>>>>>>>> - -Usage: LAST(field). Return the last non-null value of a field based on natural document order. Returns NULL if no records exist, or if all records have NULL values for the field. - -* field: mandatory. The field to return the last value for. - -Example:: - - os> source=accounts | stats last(firstname) by gender; - fetched rows / total rows = 2/2 - +-----------------+--------+ - | last(firstname) | gender | - |-----------------+--------| - | Nanette | F | - | Dale | M | - +-----------------+--------+ - -LIST ----- - -Description ->>>>>>>>>>> - -Usage: LIST(expr). Collects all values from the specified expression into an array. Values are converted to strings, nulls are filtered, and duplicates are preserved. -The function returns up to 100 values with no guaranteed ordering. - -* expr: The field expression to collect values from. -* This aggregation function doesn't support Array, Struct, Object field types. - -Example with string fields:: - - PPL> source=accounts | stats list(firstname); - fetched rows / total rows = 1/1 - +-------------------------------------+ - | list(firstname) | - |-------------------------------------| - | ["Amber","Hattie","Nanette","Dale"] | - +-------------------------------------+ - -VALUES ------- - -Description ->>>>>>>>>>> - -Usage: VALUES(expr). Collects all unique values from the specified expression into a sorted array. Values are converted to strings, nulls are filtered, and duplicates are removed. - -The maximum number of unique values returned is controlled by the ``plugins.ppl.values.max.limit`` setting: - -* Default value is 0, which means unlimited values are returned -* Can be configured to any positive integer to limit the number of unique values -* See the `PPL Settings <../admin/settings.rst#plugins-ppl-values-max-limit>`_ documentation for more details - -Example with string fields:: - - PPL> source=accounts | stats values(firstname); - fetched rows / total rows = 1/1 - +-------------------------------------+ - | values(firstname) | - |-------------------------------------| - | ["Amber","Dale","Hattie","Nanette"] | - +-------------------------------------+ \ No newline at end of file diff --git a/docs/user/ppl/functions/collection.md b/docs/user/ppl/functions/collection.md new file mode 100644 index 00000000000..3c004a22107 --- /dev/null +++ b/docs/user/ppl/functions/collection.md @@ -0,0 +1,727 @@ +# PPL Collection Functions + +## ARRAY + +### Description + +Usage: `array(value1, value2, value3...)` create an array with input values. Currently we don't allow mixture types. We will infer a least restricted type, for example `array(1, "demo")` -> ["1", "demo"] +Argument type: value1: ANY, value2: ANY, ... 
+Return type: ARRAY
+Example
+
+```ppl
+source=people
+| eval array = array(1, 2, 3)
+| fields array
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+
+| array   |
+|---------|
+| [1,2,3] |
++---------+
+```
+
+```ppl
+source=people
+| eval array = array(1, "demo")
+| fields array
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------+
+| array    |
+|----------|
+| [1,demo] |
++----------+
+```
+
+## ARRAY_LENGTH
+
+### Description
+
+Usage: `array_length(array)` returns the length of the input array.
+Argument type: array:ARRAY
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval array = array(1, 2, 3)
+| eval length = array_length(array)
+| fields length
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| length |
+|--------|
+| 3      |
++--------+
+```
+
+## FORALL
+
+### Description
+
+Usage: `forall(array, function)` checks whether every element of the array satisfies the lambda function. The lambda function must return a boolean and accepts a single argument.
+Argument type: array:ARRAY, function:LAMBDA
+Return type: BOOLEAN
+Example
+
+```ppl
+source=people
+| eval array = array(1, 2, 3), result = forall(array, x -> x > 0)
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| result |
+|--------|
+| True   |
++--------+
+```
+
+## EXISTS
+
+### Description
+
+Usage: `exists(array, function)` checks whether at least one element of the array satisfies the lambda function. The lambda function must return a boolean and accepts a single argument.
+Argument type: array:ARRAY, function:LAMBDA
+Return type: BOOLEAN
+Example
+
+```ppl
+source=people
+| eval array = array(-1, -2, 3), result = exists(array, x -> x > 0)
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| result |
+|--------|
+| True   |
++--------+
+```
+
+## FILTER
+
+### Description
+
+Usage: `filter(array, function)` filters the elements of the array using the lambda function, keeping only the elements for which it returns true. The lambda function must return a boolean and accepts a single argument.
+Argument type: array:ARRAY, function:LAMBDA
+Return type: ARRAY
+Example
+
+```ppl
+source=people
+| eval array = array(1, -2, 3), result = filter(array, x -> x > 0)
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| result |
+|--------|
+| [1,3]  |
++--------+
+```
+
+## TRANSFORM
+
+### Description
+
+Usage: `transform(array, function)` transforms the elements of the array one by one using the lambda function. The lambda function can accept either one or two arguments; if it accepts two, the second argument is the index of the element in the array.
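+
+As a quick illustration of the two lambda shapes (a sketch only, not doctested; given the semantics above, `doubled` should be `[2,4,6]` and `offset` should be `[1,3,5]`):
+
+```ppl ignore
+source=people
+| eval doubled = transform(array(1, 2, 3), x -> x * 2), offset = transform(array(1, 2, 3), (x, i) -> x + i)
+| fields doubled, offset
+| head 1
+```
+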
+Argument type: array:ARRAY, function:LAMBDA
+Return type: ARRAY
+Example
+
+```ppl
+source=people
+| eval array = array(1, -2, 3), result = transform(array, x -> x + 2)
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+
+| result  |
+|---------|
+| [3,0,5] |
++---------+
+```
+
+```ppl
+source=people
+| eval array = array(1, -2, 3), result = transform(array, (x, i) -> x + i)
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------+
+| result   |
+|----------|
+| [1,-1,5] |
++----------+
+```
+
+## REDUCE
+
+### Description
+
+Usage: `reduce(array, acc_base, function [, reduce_function])` uses the lambda function to iterate over all elements of the array, combining each element with the accumulator, which starts as acc_base. The lambda function accepts two arguments: the accumulator and the current array element. If the optional reduce_function is provided, it is applied to the final accumulator. The reduce function accepts the accumulator as its single argument.
+Argument type: array:ARRAY, acc_base:ANY, function:LAMBDA, reduce_function:LAMBDA
+Return type: ANY
+Example
+
+```ppl
+source=people
+| eval array = array(1, -2, 3), result = reduce(array, 10, (acc, x) -> acc + x)
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| result |
+|--------|
+| 12     |
++--------+
+```
+
+```ppl
+source=people
+| eval array = array(1, -2, 3), result = reduce(array, 10, (acc, x) -> acc + x, acc -> acc * 10)
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| result |
+|--------|
+| 120    |
++--------+
+```
+
+## MVJOIN
+
+### Description
+
+Usage: mvjoin(array, delimiter) joins string array elements into a single string, separated by the specified delimiter. NULL elements are excluded from the output. Only string arrays are supported.
+Argument type: array: ARRAY of STRING, delimiter: STRING
+Return type: STRING
+Example
+
+```ppl
+source=people
+| eval result = mvjoin(array('a', 'b', 'c'), ',')
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| result |
+|--------|
+| a,b,c  |
++--------+
+```
+
+```ppl
+source=accounts
+| eval names_array = array(firstname, lastname)
+| eval result = mvjoin(names_array, ', ')
+| fields result
+| head 1
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------+
+| result      |
+|-------------|
+| Amber, Duke |
++-------------+
+```
+
+## MVAPPEND
+
+### Description
+
+Usage: mvappend(value1, value2, value3...) appends all elements from the arguments to create an array. Array arguments are flattened and all individual elements are collected. Always returns an array or null for consistent type behavior.
+Argument type: value1: ANY, value2: ANY, ...
+Return type: ARRAY +Example + +```ppl +source=people +| eval result = mvappend(1, 1, 3) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| result | +|---------| +| [1,1,3] | ++---------+ +``` + +```ppl +source=people +| eval result = mvappend(1, array(2, 3)) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| result | +|---------| +| [1,2,3] | ++---------+ +``` + +```ppl +source=people +| eval result = mvappend(mvappend(1, 2), 3) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| result | +|---------| +| [1,2,3] | ++---------+ +``` + +```ppl +source=people +| eval result = mvappend(42) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| result | +|--------| +| [42] | ++--------+ +``` + +```ppl +source=people +| eval result = mvappend(nullif(1, 1), 2) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| result | +|--------| +| [2] | ++--------+ +``` + +```ppl +source=people +| eval result = mvappend(nullif(1, 1)) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| result | +|--------| +| null | ++--------+ +``` + +```ppl +source=people +| eval arr1 = array(1, 2), arr2 = array(3, 4), result = mvappend(arr1, arr2) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+ +| result | +|-----------| +| [1,2,3,4] | ++-----------+ +``` + +```ppl +source=accounts +| eval result = mvappend(firstname, lastname) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| result | +|--------------| +| [Amber,Duke] | ++--------------+ +``` + +```ppl +source=people +| eval result = mvappend(1, 'text', 2.5) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| result | +|--------------| +| [1,text,2.5] | ++--------------+ +``` + +## SPLIT + +### Description + +Usage: split(str, delimiter) splits the string values on the delimiter and returns the string values as a multivalue field (array). Use an empty string ("") to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array. 
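+
+As a sketch of the empty-input edge case (illustrative, not doctested; per the rule above, `result` should be an empty array):
+
+```ppl ignore
+source=people
+| eval result = split('', ';')
+| fields result
+| head 1
+```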
+ +Argument type: str: STRING, delimiter: STRING + +Return type: ARRAY of STRING + +### Example + +```ppl +source=people +| eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------------------------+ +| result | +|------------------------------------| +| [buttercup,rarity,tenderhoof,dash] | ++------------------------------------+ +``` + +```ppl +source=people +| eval test = '1a2b3c4def567890', result = split(test, 'def') +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------+ +| result | +|------------------| +| [1a2b3c4,567890] | ++------------------+ +``` + +```ppl +source=people +| eval test = 'abcd', result = split(test, '') +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+ +| result | +|-----------| +| [a,b,c,d] | ++-----------+ +``` + +```ppl +source=people +| eval test = 'name::value', result = split(test, '::') +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| result | +|--------------| +| [name,value] | ++--------------+ +``` + +```ppl +source=people +| eval test = 'hello', result = split(test, ',') +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| result | +|---------| +| [hello] | ++---------+ +``` + +## MVDEDUP + +### Description + +Usage: mvdedup(array) removes duplicate values from a multivalue array while preserving the order of first occurrence. NULL elements are filtered out. Returns an array with duplicates removed, or null if the input is null. +Argument type: array: ARRAY +Return type: ARRAY +Example + +```ppl +source=people +| eval array = array(1, 2, 2, 3, 1, 4), result = mvdedup(array) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+ +| result | +|-----------| +| [1,2,3,4] | ++-----------+ +``` + +```ppl +source=people +| eval array = array('z', 'a', 'z', 'b', 'a', 'c'), result = mvdedup(array) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+ +| result | +|-----------| +| [z,a,b,c] | ++-----------+ +``` + +```ppl +source=people +| eval array = array(), result = mvdedup(array) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| result | +|--------| +| [] | ++--------+ +``` + +## MVINDEX + +### Description + +Usage: mvindex(array, start, [end]) returns a subset of the multivalue array using the start and optional end index values. Indexes are 0-based (first element is at index 0). Supports negative indexing where -1 refers to the last element. When only start is provided, returns a single element. When both start and end are provided, returns an array of elements from start to end (inclusive). 
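+
+A common pattern combines `mvindex` with `split`; in the sketch below (illustrative, not doctested), `result` should be the last path segment `c` given the semantics above:
+
+```ppl ignore
+source=people
+| eval result = mvindex(split('a/b/c', '/'), -1)
+| fields result
+| head 1
+```
+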
+Argument type: array: ARRAY, start: INTEGER, end: INTEGER (optional) +Return type: ANY (single element) or ARRAY (range) +Example + +```ppl +source=people +| eval array = array('a', 'b', 'c', 'd', 'e'), result = mvindex(array, 1) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| result | +|--------| +| b | ++--------+ +``` + +```ppl +source=people +| eval array = array('a', 'b', 'c', 'd', 'e'), result = mvindex(array, -1) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| result | +|--------| +| e | ++--------+ +``` + +```ppl +source=people +| eval array = array(1, 2, 3, 4, 5), result = mvindex(array, 1, 3) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| result | +|---------| +| [2,3,4] | ++---------+ +``` + +```ppl +source=people +| eval array = array(1, 2, 3, 4, 5), result = mvindex(array, -3, -1) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| result | +|---------| +| [3,4,5] | ++---------+ +``` + +```ppl +source=people +| eval array = array('alex', 'celestino', 'claudia', 'david'), result = mvindex(array, 0, 2) +| fields result +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------+ +| result | +|--------------------------| +| [alex,celestino,claudia] | ++--------------------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst deleted file mode 100644 index fdea75d3e81..00000000000 --- a/docs/user/ppl/functions/collection.rst +++ /dev/null @@ -1,450 +0,0 @@ -=========================== -PPL Collection Functions -=========================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -ARRAY ------ - -Description ->>>>>>>>>>> - -Usage: ``array(value1, value2, value3...)`` create an array with input values. Currently we don't allow mixture types. We will infer a least restricted type, for example ``array(1, "demo")`` -> ["1", "demo"] - -Argument type: value1: ANY, value2: ANY, ... - -Return type: ARRAY - -Example:: - - os> source=people | eval array = array(1, 2, 3) | fields array | head 1 - fetched rows / total rows = 1/1 - +---------+ - | array | - |---------| - | [1,2,3] | - +---------+ - - os> source=people | eval array = array(1, "demo") | fields array | head 1 - fetched rows / total rows = 1/1 - +----------+ - | array | - |----------| - | [1,demo] | - +----------+ - -ARRAY_LENGTH ------------- - -Description ->>>>>>>>>>> - -Usage: ``array_length(array)`` returns the length of input array. - -Argument type: array:ARRAY - -Return type: INTEGER - -Example:: - - os> source=people | eval array = array(1, 2, 3) | eval length = array_length(array) | fields length | head 1 - fetched rows / total rows = 1/1 - +--------+ - | length | - |--------| - | 3 | - +--------+ - -FORALL ------- - -Description ->>>>>>>>>>> - -Usage: ``forall(array, function)`` check whether all element inside array can meet the lambda function. The function should also return boolean. The lambda function accepts one single input. 
- -Argument type: array:ARRAY, function:LAMBDA - -Return type: BOOLEAN - -Example:: - - os> source=people | eval array = array(1, 2, 3), result = forall(array, x -> x > 0) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | True | - +--------+ - -EXISTS ------- - -Description ->>>>>>>>>>> - -Usage: ``exists(array, function)`` check whether existing one of element inside array can meet the lambda function. The function should also return boolean. The lambda function accepts one single input. - -Argument type: array:ARRAY, function:LAMBDA - -Return type: BOOLEAN - -Example:: - - os> source=people | eval array = array(-1, -2, 3), result = exists(array, x -> x > 0) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | True | - +--------+ - -FILTER ------- - -Description ->>>>>>>>>>> - -Usage: ``filter(array, function)`` filter the element in the array by the lambda function. The function should return boolean. The lambda function accepts one single input. - -Argument type: array:ARRAY, function:LAMBDA - -Return type: ARRAY - -Example:: - - os> source=people | eval array = array(1, -2, 3), result = filter(array, x -> x > 0) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | [1,3] | - +--------+ - -TRANSFORM ---------- - -Description ->>>>>>>>>>> - -Usage: ``transform(array, function)`` transform the element of array one by one using lambda. The lambda function can accept one single input or two input. If the lambda accepts two argument, the second one is the index of element in array. - -Argument type: array:ARRAY, function:LAMBDA - -Return type: ARRAY - -Example:: - - os> source=people | eval array = array(1, -2, 3), result = transform(array, x -> x + 2) | fields result | head 1 - fetched rows / total rows = 1/1 - +---------+ - | result | - |---------| - | [3,0,5] | - +---------+ - - os> source=people | eval array = array(1, -2, 3), result = transform(array, (x, i) -> x + i) | fields result | head 1 - fetched rows / total rows = 1/1 - +----------+ - | result | - |----------| - | [1,-1,5] | - +----------+ - -REDUCE ------- - -Description ->>>>>>>>>>> - -Usage: ``reduce(array, acc_base, function, )`` use lambda function to go through all element and interact with acc_base. The lambda function accept two argument accumulator and array element. If add one more reduce_function, will apply reduce_function to accumulator finally. The reduce function accept accumulator as the one argument. - -Argument type: array:ARRAY, acc_base:ANY, function:LAMBDA, reduce_function:LAMBDA - -Return type: ANY - -Example:: - - os> source=people | eval array = array(1, -2, 3), result = reduce(array, 10, (acc, x) -> acc + x) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | 12 | - +--------+ - - os> source=people | eval array = array(1, -2, 3), result = reduce(array, 10, (acc, x) -> acc + x, acc -> acc * 10) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | 120 | - +--------+ - -SPLIT ------ - -Description ->>>>>>>>>>> - -Usage: split(str, delimiter) splits the string values on the delimiter and returns the string values as a multivalue field (array). Use an empty string ("") to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array. 
- -Argument type: str: STRING, delimiter: STRING - -Return type: ARRAY of STRING - -Example:: - - os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1 - fetched rows / total rows = 1/1 - +------------------------------------+ - | result | - |------------------------------------| - | [buttercup,rarity,tenderhoof,dash] | - +------------------------------------+ - - os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1 - fetched rows / total rows = 1/1 - +------------------+ - | result | - |------------------| - | [1a2b3c4,567890] | - +------------------+ - - os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1 - fetched rows / total rows = 1/1 - +-----------+ - | result | - |-----------| - | [a,b,c,d] | - +-----------+ - - os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1 - fetched rows / total rows = 1/1 - +--------------+ - | result | - |--------------| - | [name,value] | - +--------------+ - - os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1 - fetched rows / total rows = 1/1 - +---------+ - | result | - |---------| - | [hello] | - +---------+ - -MVJOIN ------- - -Description ->>>>>>>>>>> - -Usage: mvjoin(array, delimiter) joins string array elements into a single string, separated by the specified delimiter. NULL elements are excluded from the output. Only string arrays are supported. - -Argument type: array: ARRAY of STRING, delimiter: STRING - -Return type: STRING - -Example:: - - os> source=people | eval result = mvjoin(array('a', 'b', 'c'), ',') | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | a,b,c | - +--------+ - - os> source=accounts | eval names_array = array(firstname, lastname) | eval result = mvjoin(names_array, ', ') | fields result | head 1 - fetched rows / total rows = 1/1 - +-------------+ - | result | - |-------------| - | Amber, Duke | - +-------------+ - -MVAPPEND --------- - -Description ->>>>>>>>>>> - -Usage: mvappend(value1, value2, value3...) appends all elements from arguments to create an array. Flattens array arguments and collects all individual elements. Always returns an array or null for consistent type behavior. - -Argument type: value1: ANY, value2: ANY, ... 
- -Return type: ARRAY - -Example:: - - os> source=people | eval result = mvappend(1, 1, 3) | fields result | head 1 - fetched rows / total rows = 1/1 - +---------+ - | result | - |---------| - | [1,1,3] | - +---------+ - - os> source=people | eval result = mvappend(1, array(2, 3)) | fields result | head 1 - fetched rows / total rows = 1/1 - +---------+ - | result | - |---------| - | [1,2,3] | - +---------+ - - os> source=people | eval result = mvappend(mvappend(1, 2), 3) | fields result | head 1 - fetched rows / total rows = 1/1 - +---------+ - | result | - |---------| - | [1,2,3] | - +---------+ - - os> source=people | eval result = mvappend(42) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | [42] | - +--------+ - - os> source=people | eval result = mvappend(nullif(1, 1), 2) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | [2] | - +--------+ - - os> source=people | eval result = mvappend(nullif(1, 1)) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | null | - +--------+ - - os> source=people | eval arr1 = array(1, 2), arr2 = array(3, 4), result = mvappend(arr1, arr2) | fields result | head 1 - fetched rows / total rows = 1/1 - +-----------+ - | result | - |-----------| - | [1,2,3,4] | - +-----------+ - - os> source=accounts | eval result = mvappend(firstname, lastname) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------------+ - | result | - |--------------| - | [Amber,Duke] | - +--------------+ - - os> source=people | eval result = mvappend(1, 'text', 2.5) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------------+ - | result | - |--------------| - | [1,text,2.5] | - +--------------+ - -MVDEDUP -------- - -Description ->>>>>>>>>>> - -Usage: mvdedup(array) removes duplicate values from a multivalue array while preserving the order of first occurrence. NULL elements are filtered out. Returns an array with duplicates removed, or null if the input is null. - -Argument type: array: ARRAY - -Return type: ARRAY - -Example:: - - os> source=people | eval array = array(1, 2, 2, 3, 1, 4), result = mvdedup(array) | fields result | head 1 - fetched rows / total rows = 1/1 - +-----------+ - | result | - |-----------| - | [1,2,3,4] | - +-----------+ - - os> source=people | eval array = array('z', 'a', 'z', 'b', 'a', 'c'), result = mvdedup(array) | fields result | head 1 - fetched rows / total rows = 1/1 - +-----------+ - | result | - |-----------| - | [z,a,b,c] | - +-----------+ - - os> source=people | eval array = array(), result = mvdedup(array) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | [] | - +--------+ - -MVINDEX -------- - -Description ->>>>>>>>>>> - -Usage: mvindex(array, start, [end]) returns a subset of the multivalue array using the start and optional end index values. Indexes are 0-based (first element is at index 0). Supports negative indexing where -1 refers to the last element. When only start is provided, returns a single element. When both start and end are provided, returns an array of elements from start to end (inclusive). 
- -Argument type: array: ARRAY, start: INTEGER, end: INTEGER (optional) - -Return type: ANY (single element) or ARRAY (range) - -Example:: - - os> source=people | eval array = array('a', 'b', 'c', 'd', 'e'), result = mvindex(array, 1) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | b | - +--------+ - - os> source=people | eval array = array('a', 'b', 'c', 'd', 'e'), result = mvindex(array, -1) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------+ - | result | - |--------| - | e | - +--------+ - - os> source=people | eval array = array(1, 2, 3, 4, 5), result = mvindex(array, 1, 3) | fields result | head 1 - fetched rows / total rows = 1/1 - +---------+ - | result | - |---------| - | [2,3,4] | - +---------+ - - os> source=people | eval array = array(1, 2, 3, 4, 5), result = mvindex(array, -3, -1) | fields result | head 1 - fetched rows / total rows = 1/1 - +---------+ - | result | - |---------| - | [3,4,5] | - +---------+ - - os> source=people | eval array = array('alex', 'celestino', 'claudia', 'david'), result = mvindex(array, 0, 2) | fields result | head 1 - fetched rows / total rows = 1/1 - +--------------------------+ - | result | - |--------------------------| - | [alex,celestino,claudia] | - +--------------------------+ - diff --git a/docs/user/ppl/functions/condition.md b/docs/user/ppl/functions/condition.md new file mode 100644 index 00000000000..8d65680fcda --- /dev/null +++ b/docs/user/ppl/functions/condition.md @@ -0,0 +1,803 @@ +# Condition Functions + +## ISNULL + +### Description + +Usage: isnull(field) returns TRUE if field is NULL, FALSE otherwise. +The `isnull()` function is commonly used: +- In `eval` expressions to create conditional fields +- With the `if()` function to provide default values +- In `where` clauses to filter null records + +Argument type: all the supported data types. +Return type: BOOLEAN +Example + +```ppl +source=accounts +| eval result = isnull(employer) +| fields result, employer, firstname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+----------+-----------+ +| result | employer | firstname | +|--------+----------+-----------| +| False | Pyrami | Amber | +| False | Netagy | Hattie | +| False | Quility | Nanette | +| True | null | Dale | ++--------+----------+-----------+ +``` + +Using with if() to label records + +```ppl +source=accounts +| eval status = if(isnull(employer), 'unemployed', 'employed') +| fields firstname, employer, status +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------+------------+ +| firstname | employer | status | +|-----------+----------+------------| +| Amber | Pyrami | employed | +| Hattie | Netagy | employed | +| Nanette | Quility | employed | +| Dale | null | unemployed | ++-----------+----------+------------+ +``` + +Filtering with where clause + +```ppl +source=accounts +| where isnull(employer) +| fields account_number, firstname, employer +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+-----------+----------+ +| account_number | firstname | employer | +|----------------+-----------+----------| +| 18 | Dale | null | ++----------------+-----------+----------+ +``` + +## ISNOTNULL + +### Description + +Usage: isnotnull(field) returns TRUE if field is NOT NULL, FALSE otherwise. 
+The `isnotnull()` function is commonly used: +- In `eval` expressions to create boolean flags +- In `where` clauses to filter out null values +- With the `if()` function for conditional logic +- To validate data presence + +Argument type: all the supported data types. +Return type: BOOLEAN +Synonyms: [ISPRESENT](#ispresent) +Example + +```ppl +source=accounts +| eval has_employer = isnotnull(employer) +| fields firstname, employer, has_employer +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------+--------------+ +| firstname | employer | has_employer | +|-----------+----------+--------------| +| Amber | Pyrami | True | +| Hattie | Netagy | True | +| Nanette | Quility | True | +| Dale | null | False | ++-----------+----------+--------------+ +``` + +Filtering with where clause + +```ppl +source=accounts +| where not isnotnull(employer) +| fields account_number, employer +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+----------+ +| account_number | employer | +|----------------+----------| +| 18 | null | ++----------------+----------+ +``` + +Using with if() for validation messages + +```ppl +source=accounts +| eval validation = if(isnotnull(employer), 'valid', 'missing employer') +| fields firstname, employer, validation +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+----------+------------------+ +| firstname | employer | validation | +|-----------+----------+------------------| +| Amber | Pyrami | valid | +| Hattie | Netagy | valid | +| Nanette | Quility | valid | +| Dale | null | missing employer | ++-----------+----------+------------------+ +``` + +## EXISTS + +[Since OpenSearch doesn't differentiate null and missing](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-exists-query.html), we can't provide functions like ismissing/isnotmissing to test if a field exists or not. But you can still use isnull/isnotnull for such purpose. +Example, the account 13 doesn't have email field + +```ppl +source=accounts +| where isnull(email) +| fields account_number, email +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+-------+ +| account_number | email | +|----------------+-------| +| 13 | null | ++----------------+-------+ +``` + +## IFNULL + +### Description + +Usage: ifnull(field1, field2) returns field2 if field1 is null. +Argument type: all the supported data types (NOTE : if two parameters have different types, you will fail semantic check). +Return type: any +Example + +```ppl +source=accounts +| eval result = ifnull(employer, 'default') +| fields result, employer, firstname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------+----------+-----------+ +| result | employer | firstname | +|---------+----------+-----------| +| Pyrami | Pyrami | Amber | +| Netagy | Netagy | Hattie | +| Quility | Quility | Nanette | +| default | null | Dale | ++---------+----------+-----------+ +``` + +### Nested IFNULL Pattern + +For OpenSearch versions prior to 3.1, COALESCE-like functionality can be achieved using nested IFNULL statements. This pattern is particularly useful in observability use cases where field names may vary across different data sources. 
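+
+On OpenSearch 3.1 and later, the same fallback chain can usually be expressed more directly with `coalesce` (see the COALESCE section below); a minimal sketch:
+
+```ppl ignore
+source=accounts
+| eval result = coalesce(employer, firstname, lastname)
+| fields result
+```
+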
+Usage: ifnull(field1, ifnull(field2, ifnull(field3, default_value))) +Example + +```ppl +source=accounts +| eval result = ifnull(employer, ifnull(firstname, ifnull(lastname, "unknown"))) +| fields result, employer, firstname, lastname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------+----------+-----------+----------+ +| result | employer | firstname | lastname | +|---------+----------+-----------+----------| +| Pyrami | Pyrami | Amber | Duke | +| Netagy | Netagy | Hattie | Bond | +| Quility | Quility | Nanette | Bates | +| Dale | null | Dale | Adams | ++---------+----------+-----------+----------+ +``` + +## NULLIF + +### Description + +Usage: nullif(field1, field2) returns null if two parameters are same, otherwise returns field1. +Argument type: all the supported data types (NOTE : if two parameters have different types, you will fail semantic check). +Return type: any +Example + +```ppl +source=accounts +| eval result = nullif(employer, 'Pyrami') +| fields result, employer, firstname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------+----------+-----------+ +| result | employer | firstname | +|---------+----------+-----------| +| null | Pyrami | Amber | +| Netagy | Netagy | Hattie | +| Quility | Quility | Nanette | +| null | null | Dale | ++---------+----------+-----------+ +``` + +## IF + +### Description + +Usage: if(condition, expr1, expr2) returns expr1 if condition is true, otherwise returns expr2. +Argument type: all the supported data types (NOTE : if expr1 and expr2 are different types, you will fail semantic check). +Return type: any +Example + +```ppl +source=accounts +| eval result = if(true, firstname, lastname) +| fields result, firstname, lastname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------+-----------+----------+ +| result | firstname | lastname | +|---------+-----------+----------| +| Amber | Amber | Duke | +| Hattie | Hattie | Bond | +| Nanette | Nanette | Bates | +| Dale | Dale | Adams | ++---------+-----------+----------+ +``` + +```ppl +source=accounts +| eval result = if(false, firstname, lastname) +| fields result, firstname, lastname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+-----------+----------+ +| result | firstname | lastname | +|--------+-----------+----------| +| Duke | Amber | Duke | +| Bond | Hattie | Bond | +| Bates | Nanette | Bates | +| Adams | Dale | Adams | ++--------+-----------+----------+ +``` + +```ppl +source=accounts +| eval is_vip = if(age > 30 AND isnotnull(employer), true, false) +| fields is_vip, firstname, lastname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+-----------+----------+ +| is_vip | firstname | lastname | +|--------+-----------+----------| +| True | Amber | Duke | +| True | Hattie | Bond | +| False | Nanette | Bates | +| False | Dale | Adams | ++--------+-----------+----------+ +``` + +## CASE + +### Description + +Usage: case(condition1, expr1, condition2, expr2, ... conditionN, exprN else default) returns expr1 if condition1 is true, or returns expr2 if condition2 is true, ... if no condition is true, then returns the value of ELSE clause. If the ELSE clause is not defined, returns NULL. +Argument type: all the supported data types (NOTE : there is no comma before "else"). 
+Return type: any +### Limitations + +When each condition is a field comparison with a numeric literal and each result expression is a string literal, the query will be optimized as [range aggregations](https://docs.opensearch.org/latest/aggregations/bucket/range) if pushdown optimization is enabled. However, this optimization has the following limitations: +- Null values will not be grouped into any bucket of a range aggregation and will be ignored +- The default ELSE clause will use the string literal `"null"` instead of actual NULL values + +Example + +```ppl +source=accounts +| eval result = case(age > 35, firstname, age < 30, lastname else employer) +| fields result, firstname, lastname, age, employer +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+-----------+----------+-----+----------+ +| result | firstname | lastname | age | employer | +|--------+-----------+----------+-----+----------| +| Pyrami | Amber | Duke | 32 | Pyrami | +| Hattie | Hattie | Bond | 36 | Netagy | +| Bates | Nanette | Bates | 28 | Quility | +| null | Dale | Adams | 33 | null | ++--------+-----------+----------+-----+----------+ +``` + +```ppl +source=accounts +| eval result = case(age > 35, firstname, age < 30, lastname) +| fields result, firstname, lastname, age +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+-----------+----------+-----+ +| result | firstname | lastname | age | +|--------+-----------+----------+-----| +| null | Amber | Duke | 32 | +| Hattie | Hattie | Bond | 36 | +| Bates | Nanette | Bates | 28 | +| null | Dale | Adams | 33 | ++--------+-----------+----------+-----+ +``` + +```ppl +source=accounts +| where true = case(age > 35, false, age < 30, false else true) +| fields firstname, lastname, age +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----------+----------+-----+ +| firstname | lastname | age | +|-----------+----------+-----| +| Amber | Duke | 32 | +| Dale | Adams | 33 | ++-----------+----------+-----+ +``` + +## COALESCE + +### Description + +Usage: coalesce(field1, field2, ...) returns the first non-null, non-missing value in the argument list. +Argument type: all the supported data types. Supports mixed data types with automatic type coercion. 
+Return type: determined by the least restrictive common type among all arguments, with fallback to string if no common type can be determined +Behavior: +- Returns the first value that is not null and not missing (missing includes non-existent fields) +- Empty strings ("") and whitespace strings (" ") are considered valid values +- If all arguments are null or missing, returns null +- Automatic type coercion is applied to match the determined return type +- If type conversion fails, the value is converted to string representation +- For best results, use arguments of the same data type to avoid unexpected type conversions + +Performance Considerations: +- Optimized for multiple field evaluation, more efficient than nested IFNULL patterns +- Evaluates arguments sequentially, stopping at the first non-null value +- Consider field order based on likelihood of containing values to minimize evaluation overhead + +Limitations: +- Type coercion may result in unexpected string conversions for incompatible types +- Performance may degrade with very large numbers of arguments + +Example + +```ppl +source=accounts +| eval result = coalesce(employer, firstname, lastname) +| fields result, firstname, lastname, employer +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------+-----------+----------+----------+ +| result | firstname | lastname | employer | +|---------+-----------+----------+----------| +| Pyrami | Amber | Duke | Pyrami | +| Netagy | Hattie | Bond | Netagy | +| Quility | Nanette | Bates | Quility | +| Dale | Dale | Adams | null | ++---------+-----------+----------+----------+ +``` + +Empty String Handling Examples + +```ppl +source=accounts +| eval empty_field = "" +| eval result = coalesce(empty_field, firstname) +| fields result, empty_field, firstname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+-------------+-----------+ +| result | empty_field | firstname | +|--------+-------------+-----------| +| | | Amber | +| | | Hattie | +| | | Nanette | +| | | Dale | ++--------+-------------+-----------+ +``` + +```ppl +source=accounts +| eval result = coalesce(" ", firstname) +| fields result, firstname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------+-----------+ +| result | firstname | +|--------+-----------| +| | Amber | +| | Hattie | +| | Nanette | +| | Dale | ++--------+-----------+ +``` + +Mixed Data Types with Auto Coercion + +```ppl +source=accounts +| eval result = coalesce(employer, balance, "fallback") +| fields result, employer, balance +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------+----------+---------+ +| result | employer | balance | +|---------+----------+---------| +| Pyrami | Pyrami | 39225 | +| Netagy | Netagy | 5686 | +| Quility | Quility | 32838 | +| 4180 | null | 4180 | ++---------+----------+---------+ +``` + +Non-existent Field Handling + +```ppl +source=accounts +| eval result = coalesce(nonexistent_field, firstname, "unknown") +| fields result, firstname +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++---------+-----------+ +| result | firstname | +|---------+-----------| +| Amber | Amber | +| Hattie | Hattie | +| Nanette | Nanette | +| Dale | Dale | ++---------+-----------+ +``` + +## ISPRESENT + +### Description + +Usage: ispresent(field) returns true if the field exists. +Argument type: all the supported data types. 
+Return type: BOOLEAN
+Synonyms: [ISNOTNULL](#isnotnull)
+Example
+
+```ppl
+source=accounts
+| where ispresent(employer)
+| fields employer, firstname
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++----------+-----------+
+| employer | firstname |
+|----------+-----------|
+| Pyrami   | Amber     |
+| Netagy   | Hattie    |
+| Quility  | Nanette   |
++----------+-----------+
+```
+
+## ISBLANK
+
+### Description
+
+Usage: isblank(field) returns true if the field is null, an empty string, or contains only white space.
+Argument type: all the supported data types.
+Return type: BOOLEAN
+Example
+
+```ppl
+source=accounts
+| eval temp = ifnull(employer, ' ')
+| eval `isblank(employer)` = isblank(employer), `isblank(temp)` = isblank(temp)
+| fields `isblank(temp)`, temp, `isblank(employer)`, employer
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++---------------+---------+-------------------+----------+
+| isblank(temp) | temp    | isblank(employer) | employer |
+|---------------+---------+-------------------+----------|
+| False         | Pyrami  | False             | Pyrami   |
+| False         | Netagy  | False             | Netagy   |
+| False         | Quility | False             | Quility  |
+| True          |         | True              | null     |
++---------------+---------+-------------------+----------+
+```
+
+## ISEMPTY
+
+### Description
+
+Usage: isempty(field) returns true if the field is null or is an empty string.
+Argument type: all the supported data types.
+Return type: BOOLEAN
+Example
+
+```ppl
+source=accounts
+| eval temp = ifnull(employer, ' ')
+| eval `isempty(employer)` = isempty(employer), `isempty(temp)` = isempty(temp)
+| fields `isempty(temp)`, temp, `isempty(employer)`, employer
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++---------------+---------+-------------------+----------+
+| isempty(temp) | temp    | isempty(employer) | employer |
+|---------------+---------+-------------------+----------|
+| False         | Pyrami  | False             | Pyrami   |
+| False         | Netagy  | False             | Netagy   |
+| False         | Quility | False             | Quility  |
+| False         |         | True              | null     |
++---------------+---------+-------------------+----------+
+```
+
+## EARLIEST
+
+### Description
+
+Usage: earliest(relative_string, field) returns true if the value of field is after the timestamp derived from relative_string relative to the current time. Otherwise, returns false.
+relative_string:
+The relative string can be one of the following formats:
+1. `"now"` or `"now()"`:
+
+   Uses the current system time.
+2. Absolute format (`MM/dd/yyyy:HH:mm:ss` or `yyyy-MM-dd HH:mm:ss`):
+
+   Converts the string to a timestamp and compares it with the data.
+3. Relative format: `(+|-)<time_amount><time_unit>[+<...>]@<snap_unit>`
+
+   Steps to specify a relative time:
+   - **a. Time offset:** Indicate the offset from the current time using `+` or `-`.
+   - **b. Time amount:** Provide a numeric value followed by a time unit (`s`, `m`, `h`, `d`, `w`, `M`, `y`).
+   - **c. Snap to unit:** Optionally specify a snap unit with `@` to round the result down to the nearest unit (e.g., hour, day, month).
+ + **Examples** (assuming current time is `2025-05-28 14:28:34`): + - `-3d+2y` → `2027-05-25 14:28:34` + - `+1d@m` → `2025-05-29 14:28:00` + - `-3M+1y@M` → `2026-02-01 00:00:00` + +Read more details [here](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/ppl-lang/functions/ppl-datetime.md#relative_timestamp) +Argument type: relative_string:STRING, field: TIMESTAMP +Return type: BOOLEAN +Example + +```ppl +source=accounts +| eval now = utc_timestamp() +| eval a = earliest("now", now), b = earliest("-2d@d", now) +| fields a, b +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------+------+ +| a | b | +|-------+------| +| False | True | ++-------+------+ +``` + +```ppl +source=nyc_taxi +| where earliest('07/01/2014:00:30:00', timestamp) +| stats COUNT() as cnt +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----+ +| cnt | +|-----| +| 972 | ++-----+ +``` + +## LATEST + +### Description + +Usage: latest(relative_string, field) returns true if the value of field is before the timestamp derived from relative_string relative to the current time. Otherwise, returns false. +Argument type: relative_string:STRING, field: TIMESTAMP +Return type: BOOLEAN +Example + +```ppl +source=accounts +| eval now = utc_timestamp() +| eval a = latest("now", now), b = latest("+2d@d", now) +| fields a, b +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------+------+ +| a | b | +|------+------| +| True | True | ++------+------+ +``` + +```ppl +source=nyc_taxi +| where latest('07/21/2014:04:00:00', timestamp) +| stats COUNT() as cnt +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----+ +| cnt | +|-----| +| 969 | ++-----+ +``` + +## REGEXP_MATCH + +### Description + +Usage: regexp_match(string, pattern) returns true if the regular expression pattern finds a match against any substring of the string value, otherwise returns false. +The function uses Java regular expression syntax for the pattern. 
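+
+Because the pattern can match any substring, matching is unanchored by default; anchor with `^` and `$` to require a full-string match. In the sketch below (illustrative, not doctested), `has_err` should be true while `exact` should be false:
+
+```ppl ignore
+source=people
+| eval has_err = regexp_match('disk ERROR on /dev/sda1', 'ERROR'), exact = regexp_match('ERRORS', '^ERROR$')
+| fields has_err, exact
+| head 1
+```
+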
+Argument type: STRING, STRING +Return type: BOOLEAN +Example + +``` ppl ignore +source=logs | where regexp_match(message, 'ERROR|WARN|FATAL') | fields timestamp, message +``` + +```text +fetched rows / total rows = 3/100 ++---------------------+------------------------------------------+ +| timestamp | message | +|---------------------+------------------------------------------| +| 2024-01-15 10:23:45 | ERROR: Connection timeout to database | +| 2024-01-15 10:24:12 | WARN: High memory usage detected | +| 2024-01-15 10:25:33 | FATAL: System crashed unexpectedly | ++---------------------+------------------------------------------+ +``` + +``` ppl ignore +source=users | where regexp_match(email, '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}') | fields name, email +``` + +```text +fetched rows / total rows = 2/3 ++-------+----------------------+ +| name | email | +|-------+----------------------| +| John | john@example.com | +| Alice | alice@company.org | ++-------+----------------------+ +``` + +```ppl ignore +source=network | where regexp_match(ip_address, '^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$') AND NOT regexp_match(ip_address, '^(10\.|172\.(1[6-9]|2[0-9]|3[01])\.|192\.168\.)') | fields ip_address, status +``` + +```text +fetched rows / total rows = 2/10 ++---------------+--------+ +| ip_address | status | +|---------------+--------| +| 8.8.8.8 | active | +| 1.1.1.1 | active | ++---------------+--------+ +``` + +```ppl ignore +source=products | eval category = if(regexp_match(name, '(?i)(laptop|computer|desktop)'), 'Computing', if(regexp_match(name, '(?i)(phone|tablet|mobile)'), 'Mobile', 'Other')) | fields name, category +``` + +```text +fetched rows / total rows = 4/4 ++------------------------+----------+ +| name | category | +|------------------------+----------| +| Dell Laptop XPS | Computing| +| iPhone 15 Pro | Mobile | +| Wireless Mouse | Other | +| Desktop Computer Tower | Computing| ++------------------------+----------+ +``` \ No newline at end of file diff --git a/docs/user/ppl/functions/condition.rst b/docs/user/ppl/functions/condition.rst deleted file mode 100644 index 0364fa7b116..00000000000 --- a/docs/user/ppl/functions/condition.rst +++ /dev/null @@ -1,615 +0,0 @@ -=================== -Condition Functions -=================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -ISNULL ------- - -Description ->>>>>>>>>>> - -Usage: isnull(field) returns TRUE if field is NULL, FALSE otherwise. - -The `isnull()` function is commonly used: -- In `eval` expressions to create conditional fields -- With the `if()` function to provide default values -- In `where` clauses to filter null records - -Argument type: all the supported data types. 
- -Return type: BOOLEAN - -Example:: - - os> source=accounts | eval result = isnull(employer) | fields result, employer, firstname - fetched rows / total rows = 4/4 - +--------+----------+-----------+ - | result | employer | firstname | - |--------+----------+-----------| - | False | Pyrami | Amber | - | False | Netagy | Hattie | - | False | Quility | Nanette | - | True | null | Dale | - +--------+----------+-----------+ - -Using with if() to label records:: - - os> source=accounts | eval status = if(isnull(employer), 'unemployed', 'employed') | fields firstname, employer, status - fetched rows / total rows = 4/4 - +-----------+----------+------------+ - | firstname | employer | status | - |-----------+----------+------------| - | Amber | Pyrami | employed | - | Hattie | Netagy | employed | - | Nanette | Quility | employed | - | Dale | null | unemployed | - +-----------+----------+------------+ - -Filtering with where clause:: - - os> source=accounts | where isnull(employer) | fields account_number, firstname, employer - fetched rows / total rows = 1/1 - +----------------+-----------+----------+ - | account_number | firstname | employer | - |----------------+-----------+----------| - | 18 | Dale | null | - +----------------+-----------+----------+ - -ISNOTNULL ---------- - -Description ->>>>>>>>>>> - -Usage: isnotnull(field) returns TRUE if field is NOT NULL, FALSE otherwise. - -The `isnotnull()` function is commonly used: -- In `eval` expressions to create boolean flags -- In `where` clauses to filter out null values -- With the `if()` function for conditional logic -- To validate data presence - -Argument type: all the supported data types. - -Return type: BOOLEAN - -Synonyms: `ISPRESENT`_ - -Example:: - - os> source=accounts | eval has_employer = isnotnull(employer) | fields firstname, employer, has_employer - fetched rows / total rows = 4/4 - +-----------+----------+--------------+ - | firstname | employer | has_employer | - |-----------+----------+--------------| - | Amber | Pyrami | True | - | Hattie | Netagy | True | - | Nanette | Quility | True | - | Dale | null | False | - +-----------+----------+--------------+ - -Filtering with where clause:: - - os> source=accounts | where not isnotnull(employer) | fields account_number, employer - fetched rows / total rows = 1/1 - +----------------+----------+ - | account_number | employer | - |----------------+----------| - | 18 | null | - +----------------+----------+ - -Using with if() for validation messages:: - - os> source=accounts | eval validation = if(isnotnull(employer), 'valid', 'missing employer') | fields firstname, employer, validation - fetched rows / total rows = 4/4 - +-----------+----------+------------------+ - | firstname | employer | validation | - |-----------+----------+------------------| - | Amber | Pyrami | valid | - | Hattie | Netagy | valid | - | Nanette | Quility | valid | - | Dale | null | missing employer | - +-----------+----------+------------------+ - -EXISTS ------- - -`Since OpenSearch doesn't differentiate null and missing `_, we can't provide functions like ismissing/isnotmissing to test if a field exists or not. But you can still use isnull/isnotnull for such purpose. 
- -Example, the account 13 doesn't have email field:: - - os> source=accounts | where isnull(email) | fields account_number, email - fetched rows / total rows = 1/1 - +----------------+-------+ - | account_number | email | - |----------------+-------| - | 13 | null | - +----------------+-------+ - -IFNULL ------- - -Description ->>>>>>>>>>> - -Usage: ifnull(field1, field2) returns field2 if field1 is null. - -Argument type: all the supported data types (NOTE : if two parameters have different types, you will fail semantic check). - -Return type: any - -Example:: - - os> source=accounts | eval result = ifnull(employer, 'default') | fields result, employer, firstname - fetched rows / total rows = 4/4 - +---------+----------+-----------+ - | result | employer | firstname | - |---------+----------+-----------| - | Pyrami | Pyrami | Amber | - | Netagy | Netagy | Hattie | - | Quility | Quility | Nanette | - | default | null | Dale | - +---------+----------+-----------+ - -Nested IFNULL Pattern ->>>>>>>>>>>>>>>>>>>>> - -For OpenSearch versions prior to 3.1, COALESCE-like functionality can be achieved using nested IFNULL statements. This pattern is particularly useful in observability use cases where field names may vary across different data sources. - -Usage: ifnull(field1, ifnull(field2, ifnull(field3, default_value))) - -Example:: - - os> source=accounts | eval result = ifnull(employer, ifnull(firstname, ifnull(lastname, "unknown"))) | fields result, employer, firstname, lastname - fetched rows / total rows = 4/4 - +---------+----------+-----------+----------+ - | result | employer | firstname | lastname | - |---------+----------+-----------+----------| - | Pyrami | Pyrami | Amber | Duke | - | Netagy | Netagy | Hattie | Bond | - | Quility | Quility | Nanette | Bates | - | Dale | null | Dale | Adams | - +---------+----------+-----------+----------+ - -NULLIF ------- - -Description ->>>>>>>>>>> - -Usage: nullif(field1, field2) returns null if two parameters are same, otherwise returns field1. - -Argument type: all the supported data types (NOTE : if two parameters have different types, you will fail semantic check). - -Return type: any - -Example:: - - os> source=accounts | eval result = nullif(employer, 'Pyrami') | fields result, employer, firstname - fetched rows / total rows = 4/4 - +---------+----------+-----------+ - | result | employer | firstname | - |---------+----------+-----------| - | null | Pyrami | Amber | - | Netagy | Netagy | Hattie | - | Quility | Quility | Nanette | - | null | null | Dale | - +---------+----------+-----------+ - -IF ------- - -Description ->>>>>>>>>>> - -Usage: if(condition, expr1, expr2) returns expr1 if condition is true, otherwise returns expr2. - -Argument type: all the supported data types (NOTE : if expr1 and expr2 are different types, you will fail semantic check). 
- -Return type: any - -Example:: - - os> source=accounts | eval result = if(true, firstname, lastname) | fields result, firstname, lastname - fetched rows / total rows = 4/4 - +---------+-----------+----------+ - | result | firstname | lastname | - |---------+-----------+----------| - | Amber | Amber | Duke | - | Hattie | Hattie | Bond | - | Nanette | Nanette | Bates | - | Dale | Dale | Adams | - +---------+-----------+----------+ - - os> source=accounts | eval result = if(false, firstname, lastname) | fields result, firstname, lastname - fetched rows / total rows = 4/4 - +--------+-----------+----------+ - | result | firstname | lastname | - |--------+-----------+----------| - | Duke | Amber | Duke | - | Bond | Hattie | Bond | - | Bates | Nanette | Bates | - | Adams | Dale | Adams | - +--------+-----------+----------+ - - os> source=accounts | eval is_vip = if(age > 30 AND isnotnull(employer), true, false) | fields is_vip, firstname, lastname - fetched rows / total rows = 4/4 - +--------+-----------+----------+ - | is_vip | firstname | lastname | - |--------+-----------+----------| - | True | Amber | Duke | - | True | Hattie | Bond | - | False | Nanette | Bates | - | False | Dale | Adams | - +--------+-----------+----------+ - -CASE ------- - -Description ->>>>>>>>>>> - -Usage: case(condition1, expr1, condition2, expr2, ... conditionN, exprN else default) returns expr1 if condition1 is true, or returns expr2 if condition2 is true, ... if no condition is true, then returns the value of ELSE clause. If the ELSE clause is not defined, returns NULL. - -Argument type: all the supported data types (NOTE : there is no comma before "else"). - -Return type: any - -Limitations ->>>>>>>>>>> - -When each condition is a field comparison with a numeric literal and each result expression is a string literal, the query will be optimized as `range aggregations `_ if pushdown optimization is enabled. 
However, this optimization has the following limitations: - -- Null values will not be grouped into any bucket of a range aggregation and will be ignored -- The default ELSE clause will use the string literal ``"null"`` instead of actual NULL values - -Example:: - - os> source=accounts | eval result = case(age > 35, firstname, age < 30, lastname else employer) | fields result, firstname, lastname, age, employer - fetched rows / total rows = 4/4 - +--------+-----------+----------+-----+----------+ - | result | firstname | lastname | age | employer | - |--------+-----------+----------+-----+----------| - | Pyrami | Amber | Duke | 32 | Pyrami | - | Hattie | Hattie | Bond | 36 | Netagy | - | Bates | Nanette | Bates | 28 | Quility | - | null | Dale | Adams | 33 | null | - +--------+-----------+----------+-----+----------+ - - os> source=accounts | eval result = case(age > 35, firstname, age < 30, lastname) | fields result, firstname, lastname, age - fetched rows / total rows = 4/4 - +--------+-----------+----------+-----+ - | result | firstname | lastname | age | - |--------+-----------+----------+-----| - | null | Amber | Duke | 32 | - | Hattie | Hattie | Bond | 36 | - | Bates | Nanette | Bates | 28 | - | null | Dale | Adams | 33 | - +--------+-----------+----------+-----+ - - os> source=accounts | where true = case(age > 35, false, age < 30, false else true) | fields firstname, lastname, age - fetched rows / total rows = 2/2 - +-----------+----------+-----+ - | firstname | lastname | age | - |-----------+----------+-----| - | Amber | Duke | 32 | - | Dale | Adams | 33 | - +-----------+----------+-----+ - -COALESCE --------- - -Description ->>>>>>>>>>> - -Usage: coalesce(field1, field2, ...) returns the first non-null, non-missing value in the argument list. - -Argument type: all the supported data types. Supports mixed data types with automatic type coercion. 
- -Return type: determined by the least restrictive common type among all arguments, with fallback to string if no common type can be determined - -Behavior: - -- Returns the first value that is not null and not missing (missing includes non-existent fields) -- Empty strings ("") and whitespace strings (" ") are considered valid values -- If all arguments are null or missing, returns null -- Automatic type coercion is applied to match the determined return type -- If type conversion fails, the value is converted to string representation -- For best results, use arguments of the same data type to avoid unexpected type conversions - -Performance Considerations: - -- Optimized for multiple field evaluation, more efficient than nested IFNULL patterns -- Evaluates arguments sequentially, stopping at the first non-null value -- Consider field order based on likelihood of containing values to minimize evaluation overhead - -Limitations: - -- Type coercion may result in unexpected string conversions for incompatible types -- Performance may degrade with very large numbers of arguments - -Example:: - - os> source=accounts | eval result = coalesce(employer, firstname, lastname) | fields result, firstname, lastname, employer - fetched rows / total rows = 4/4 - +---------+-----------+----------+----------+ - | result | firstname | lastname | employer | - |---------+-----------+----------+----------| - | Pyrami | Amber | Duke | Pyrami | - | Netagy | Hattie | Bond | Netagy | - | Quility | Nanette | Bates | Quility | - | Dale | Dale | Adams | null | - +---------+-----------+----------+----------+ - -Empty String Handling Examples:: - - os> source=accounts | eval empty_field = "" | eval result = coalesce(empty_field, firstname) | fields result, empty_field, firstname - fetched rows / total rows = 4/4 - +--------+-------------+-----------+ - | result | empty_field | firstname | - |--------+-------------+-----------| - | | | Amber | - | | | Hattie | - | | | Nanette | - | | | Dale | - +--------+-------------+-----------+ - - os> source=accounts | eval result = coalesce(" ", firstname) | fields result, firstname - fetched rows / total rows = 4/4 - +--------+-----------+ - | result | firstname | - |--------+-----------| - | | Amber | - | | Hattie | - | | Nanette | - | | Dale | - +--------+-----------+ - -Mixed Data Types with Auto Coercion:: - - os> source=accounts | eval result = coalesce(employer, balance, "fallback") | fields result, employer, balance - fetched rows / total rows = 4/4 - +---------+----------+---------+ - | result | employer | balance | - |---------+----------+---------| - | Pyrami | Pyrami | 39225 | - | Netagy | Netagy | 5686 | - | Quility | Quility | 32838 | - | 4180 | null | 4180 | - +---------+----------+---------+ - -Non-existent Field Handling:: - - os> source=accounts | eval result = coalesce(nonexistent_field, firstname, "unknown") | fields result, firstname - fetched rows / total rows = 4/4 - +---------+-----------+ - | result | firstname | - |---------+-----------| - | Amber | Amber | - | Hattie | Hattie | - | Nanette | Nanette | - | Dale | Dale | - +---------+-----------+ - - -ISPRESENT ---------- - -Description ->>>>>>>>>>> - -Usage: ispresent(field) returns true if the field exists. - -Argument type: all the supported data types. 
- -Return type: BOOLEAN - -Synonyms: `ISNOTNULL`_ - -Example:: - - os> source=accounts | where ispresent(employer) | fields employer, firstname - fetched rows / total rows = 3/3 - +----------+-----------+ - | employer | firstname | - |----------+-----------| - | Pyrami | Amber | - | Netagy | Hattie | - | Quility | Nanette | - +----------+-----------+ - -ISBLANK -------- - -Description ->>>>>>>>>>> - -Usage: isblank(field) returns true if the field is null, an empty string, or contains only white space. - -Argument type: all the supported data types. - -Return type: BOOLEAN - -Example:: - - os> source=accounts | eval temp = ifnull(employer, ' ') | eval `isblank(employer)` = isblank(employer), `isblank(temp)` = isblank(temp) | fields `isblank(temp)`, temp, `isblank(employer)`, employer - fetched rows / total rows = 4/4 - +---------------+---------+-------------------+----------+ - | isblank(temp) | temp | isblank(employer) | employer | - |---------------+---------+-------------------+----------| - | False | Pyrami | False | Pyrami | - | False | Netagy | False | Netagy | - | False | Quility | False | Quility | - | True | | True | null | - +---------------+---------+-------------------+----------+ - - -ISEMPTY -------- - -Description ->>>>>>>>>>> - -Usage: isempty(field) returns true if the field is null or is an empty string. - -Argument type: all the supported data types. - -Return type: BOOLEAN - -Example:: - - os> source=accounts | eval temp = ifnull(employer, ' ') | eval `isempty(employer)` = isempty(employer), `isempty(temp)` = isempty(temp) | fields `isempty(temp)`, temp, `isempty(employer)`, employer - fetched rows / total rows = 4/4 - +---------------+---------+-------------------+----------+ - | isempty(temp) | temp | isempty(employer) | employer | - |---------------+---------+-------------------+----------| - | False | Pyrami | False | Pyrami | - | False | Netagy | False | Netagy | - | False | Quility | False | Quility | - | False | | True | null | - +---------------+---------+-------------------+----------+ - -EARLIEST --------- - -Description ->>>>>>>>>>> - -Usage: earliest(relative_string, field) returns true if the value of field is after the timestamp derived from relative_string relative to the current time. Otherwise, returns false. - -relative_string: -The relative string can be one of the following formats: - -1. `"now"` or `"now()"`: - Uses the current system time. - -2. Absolute format (`MM/dd/yyyy:HH:mm:ss` or `yyyy-MM-dd HH:mm:ss`): - Converts the string to a timestamp and compares it with the data. - -3. Relative format: `(+|-)[+<...>]@` - Steps to specify a relative time: - - - **a. Time offset:** Indicate the offset from the current time using `+` or `-`. - - **b. Time amount:** Provide a numeric value followed by a time unit (`s`, `m`, `h`, `d`, `w`, `M`, `y`). - - **c. Snap to unit:** Optionally specify a snap unit with `@` to round the result down to the nearest unit (e.g., hour, day, month). 
- - **Examples** (assuming current time is `2025-05-28 14:28:34`): - - - `-3d+2y` → `2027-05-25 14:28:34` - - `+1d@m` → `2025-05-29 14:28:00` - - `-3M+1y@M` → `2026-02-01 00:00:00` - -Read more details `here `_ - -Argument type: relative_string:STRING, field: TIMESTAMP - -Return type: BOOLEAN - -Example:: - - os> source=accounts | eval now = utc_timestamp() | eval a = earliest("now", now), b = earliest("-2d@d", now) | fields a, b | head 1 - fetched rows / total rows = 1/1 - +-------+------+ - | a | b | - |-------+------| - | False | True | - +-------+------+ - - os> source=nyc_taxi | where earliest('07/01/2014:00:30:00', timestamp) | stats COUNT() as cnt - fetched rows / total rows = 1/1 - +-----+ - | cnt | - |-----| - | 972 | - +-----+ - -LATEST ------- - -Description ->>>>>>>>>>> - -Usage: latest(relative_string, field) returns true if the value of field is before the timestamp derived from relative_string relative to the current time. Otherwise, returns false. - -Argument type: relative_string:STRING, field: TIMESTAMP - -Return type: BOOLEAN - -Example:: - - os> source=accounts | eval now = utc_timestamp() | eval a = latest("now", now), b = latest("+2d@d", now) | fields a, b | head 1 - fetched rows / total rows = 1/1 - +------+------+ - | a | b | - |------+------| - | True | True | - +------+------+ - - os> source=nyc_taxi | where latest('07/21/2014:04:00:00', timestamp) | stats COUNT() as cnt - fetched rows / total rows = 1/1 - +-----+ - | cnt | - |-----| - | 969 | - +-----+ - -REGEXP_MATCH ------------ - -Description ->>>>>>>>>>> - -Usage: regexp_match(string, pattern) returns true if the regular expression pattern finds a match against any substring of the string value, otherwise returns false. - -The function uses Java regular expression syntax for the pattern. 
-
-Argument type: STRING, STRING
-
-Return type: BOOLEAN
-
-Example::
-
-    #os> source=logs | where regexp_match(message, 'ERROR|WARN|FATAL') | fields timestamp, message
-    fetched rows / total rows = 3/100
-    +---------------------+------------------------------------------+
-    | timestamp           | message                                  |
-    |---------------------+------------------------------------------|
-    | 2024-01-15 10:23:45 | ERROR: Connection timeout to database    |
-    | 2024-01-15 10:24:12 | WARN: High memory usage detected         |
-    | 2024-01-15 10:25:33 | FATAL: System crashed unexpectedly       |
-    +---------------------+------------------------------------------+
-
-    #os> source=users | where regexp_match(email, '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}') | fields name, email
-    fetched rows / total rows = 2/3
-    +-------+----------------------+
-    | name  | email                |
-    |-------+----------------------|
-    | John  | john@example.com     |
-    | Alice | alice@company.org    |
-    +-------+----------------------+
-
-    #os> source=network | where regexp_match(ip_address, '^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$') AND NOT regexp_match(ip_address, '^(10\.|172\.(1[6-9]|2[0-9]|3[01])\.|192\.168\.)') | fields ip_address, status
-    fetched rows / total rows = 2/10
-    +---------------+--------+
-    | ip_address    | status |
-    |---------------+--------|
-    | 8.8.8.8       | active |
-    | 1.1.1.1       | active |
-    +---------------+--------+
-
-    #os> source=products | eval category = if(regexp_match(name, '(?i)(laptop|computer|desktop)'), 'Computing', if(regexp_match(name, '(?i)(phone|tablet|mobile)'), 'Mobile', 'Other')) | fields name, category
-    fetched rows / total rows = 4/4
-    +------------------------+----------+
-    | name                   | category |
-    |------------------------+----------|
-    | Dell Laptop XPS        | Computing|
-    | iPhone 15 Pro          | Mobile   |
-    | Wireless Mouse         | Other    |
-    | Desktop Computer Tower | Computing|
-    +------------------------+----------+
diff --git a/docs/user/ppl/functions/conversion.md b/docs/user/ppl/functions/conversion.md
new file mode 100644
index 00000000000..a33a93bbd69
--- /dev/null
+++ b/docs/user/ppl/functions/conversion.md
@@ -0,0 +1,272 @@
+# Type Conversion Functions
+
+## CAST
+
+### Description
+
+Usage: cast(expr as dataType) casts expr to dataType and returns the value of dataType. The following conversion rules are used:
+
+| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP | DATE | TIME | IP |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| STRING | | Note1 | Note1 | TIMESTAMP() | DATE() | TIME() | IP() |
+| NUMBER | Note1 | | v!=0 | N/A | N/A | N/A | N/A |
+| BOOLEAN | Note1 | v?1:0 | | N/A | N/A | N/A | N/A |
+| TIMESTAMP | Note1 | N/A | N/A | | DATE() | TIME() | N/A |
+| DATE | Note1 | N/A | N/A | N/A | | N/A | N/A |
+| TIME | Note1 | N/A | N/A | N/A | N/A | | N/A |
+| IP | Note2 | N/A | N/A | N/A | N/A | N/A | |
+
+Note1: the conversion follows the JDK specification.
+Note2: IP will be converted to its canonical representation. Canonical representation
+for IPv6 is described in [RFC 5952](https://datatracker.ietf.org/doc/html/rfc5952).
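+
+Cast to IP example. The following is an illustrative sketch (not doctested; it reuses the `people` index from the examples below, and the exact output rendering may vary): per Note2, casting an IP back to a string yields the RFC 5952 canonical form.
+
+```ppl ignore
+source=people
+| eval `cip` = CAST('2001:0db8:0000:0000:0000:0000:0000:0001' as ip), `cstr` = CAST(CAST('2001:0db8:0000:0000:0000:0000:0000:0001' as ip) as string)
+| fields `cip`, `cstr`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------+-------------+
+| cip         | cstr        |
+|-------------+-------------|
+| 2001:db8::1 | 2001:db8::1 |
++-------------+-------------+
+```
+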
+Cast to string example
+
+```ppl
+source=people
+| eval `cbool` = CAST(true as string), `cint` = CAST(1 as string), `cdate` = CAST(CAST('2012-08-07' as date) as string)
+| fields `cbool`, `cint`, `cdate`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------+------+------------+
+| cbool | cint | cdate      |
+|-------+------+------------|
+| TRUE  | 1    | 2012-08-07 |
++-------+------+------------+
+```
+
+Cast to number example
+
+```ppl
+source=people
+| eval `cbool` = CAST(true as int), `cstring` = CAST('1' as int)
+| fields `cbool`, `cstring`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------+---------+
+| cbool | cstring |
+|-------+---------|
+| 1     | 1       |
++-------+---------+
+```
+
+Cast to date example
+
+```ppl
+source=people
+| eval `cdate` = CAST('2012-08-07' as date), `ctime` = CAST('01:01:01' as time), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp)
+| fields `cdate`, `ctime`, `ctimestamp`
+```
+
+Expected output:

+```text
+fetched rows / total rows = 1/1
++------------+----------+---------------------+
+| cdate      | ctime    | ctimestamp          |
+|------------+----------+---------------------|
+| 2012-08-07 | 01:01:01 | 2012-08-07 01:01:01 |
++------------+----------+---------------------+
+```
+
+Cast function can be chained
+
+```ppl
+source=people
+| eval `cbool` = CAST(CAST(true as string) as boolean)
+| fields `cbool`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------+
+| cbool |
+|-------|
+| True  |
++-------+
+```
+
+## IMPLICIT (AUTO) TYPE CONVERSION
+
+Implicit conversion is automatic casting. When a function does not have an exact match for the
+input types, the engine looks for another signature that can safely work with the values. It picks
+the option that requires the least stretching of the original types, so you can mix literals and
+fields without adding `CAST` everywhere.
+
+### String to numeric
+
+When a string stands in for a number we simply parse the text:
+- The value must be something like `"3.14"` or `"42"`. Anything else causes the query to fail.
+- If a string appears next to numeric arguments, it is treated as a `DOUBLE` so the numeric overload of the function can run.
+
+Use string in arithmetic operator example
+
+```ppl
+source=people
+| eval divide="5"/10, multiply="5" * 10, add="5" + 10, minus="5" - 10, concat="5" + "5"
+| fields divide, multiply, add, minus, concat
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+----------+------+-------+--------+
+| divide | multiply | add  | minus | concat |
+|--------+----------+------+-------+--------|
+| 0.5    | 50.0     | 15.0 | -5.0  | 55     |
++--------+----------+------+-------+--------+
+```
+
+Use string in comparison operator example
+
+```ppl
+source=people
+| eval e="1000"==1000, en="1000"!=1000, ed="1000"==1000.0, edn="1000"!=1000.0, l="1000">999, ld="1000">999.9, i="malformed"==1000
+| fields e, en, ed, edn, l, ld, i
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------+-------+------+-------+------+------+------+
+| e    | en    | ed   | edn   | l    | ld   | i    |
+|------+-------+------+-------+------+------+------|
+| True | False | True | False | True | True | null |
++------+-------+------+-------+------+------+------+
+```
+
+## TOSTRING
+
+### Description
+
+The following usage options are available, depending on the parameter types and the number of parameters.
+Usage with format type: tostring(ANY, [format]): Converts the value in the first argument to a string in the format given by the second argument. If the second argument is not provided, the value is converted to its default string representation.
+Return type: string
+Usage for a boolean parameter without a format type: tostring(boolean): Converts the boolean to 'TRUE' or 'FALSE'.
+Return type: string
+You can use this function with the eval command and as part of eval expressions. The first argument can be any valid type; the second argument is optional and, if provided, must be one of the format names below, which apply only when the first argument contains numbers. If the first argument is boolean, the second argument is ignored even if it is provided.
+The format argument is optional and is only used when the value argument is a number. The tostring function supports the following formats.
+Format types:
+1. "binary" Converts a number to a binary value.
+2. "hex" Converts the number to a hexadecimal value.
+3. "commas" Formats the number with commas. If the number includes a decimal, the function rounds the number to the nearest two decimal places.
+4. "duration" Converts the value in seconds to the readable time format HH:MM:SS.
+5. "duration_millis" Converts the value in milliseconds to the readable time format HH:MM:SS.
+
+Basic examples:
+You can use this function to convert a number to a string of its binary representation.
+Example
+
+```ppl
+source=accounts
+| where firstname = "Amber"
+| eval balance_binary = tostring(balance, "binary")
+| fields firstname, balance_binary, balance
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+------------------+---------+
+| firstname | balance_binary   | balance |
+|-----------+------------------+---------|
+| Amber     | 1001100100111001 | 39225   |
++-----------+------------------+---------+
+```
+
+You can use this function to convert a number to a string of its hex representation.
+Example
+
+```ppl
+source=accounts
+| where firstname = "Amber"
+| eval balance_hex = tostring(balance, "hex")
+| fields firstname, balance_hex, balance
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+-------------+---------+
+| firstname | balance_hex | balance |
+|-----------+-------------+---------|
+| Amber     | 9939        | 39225   |
++-----------+-------------+---------+
+```
+
+The following example formats the balance column to display its values with commas.
+Example
+
+```ppl
+source=accounts
+| where firstname = "Amber"
+| eval balance_commas = tostring(balance, "commas")
+| fields firstname, balance_commas, balance
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+----------------+---------+
+| firstname | balance_commas | balance |
+|-----------+----------------+---------|
+| Amber     | 39,225         | 39225   |
++-----------+----------------+---------+
+```
+
+The following example converts a number of seconds to the HH:MM:SS format representing hours, minutes and seconds.
+Example
+
+```ppl
+source=accounts
+| where firstname = "Amber"
+| eval duration = tostring(6500, "duration")
+| fields firstname, duration
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+----------+
+| firstname | duration |
+|-----------+----------|
+| Amber     | 01:48:20 |
++-----------+----------+
+```
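+
+The "duration_millis" format works the same way on millisecond values. The following is an illustrative sketch (not doctested); 6,500,000 milliseconds is 6,500 seconds, i.e. 01:48:20.
+Example
+
+```ppl ignore
+source=accounts
+| where firstname = "Amber"
+| eval duration_millis = tostring(6500000, "duration_millis")
+| fields firstname, duration_millis
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+-----------------+
+| firstname | duration_millis |
+|-----------+-----------------|
+| Amber     | 01:48:20        |
++-----------+-----------------+
+```
+
+The following example converts a boolean parameter to a string.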
+Example + +```ppl +source=accounts +| where firstname = "Amber" +| eval `boolean_str` = tostring(1=1) +| fields `boolean_str` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------+ +| boolean_str | +|-------------| +| TRUE | ++-------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/conversion.rst b/docs/user/ppl/functions/conversion.rst deleted file mode 100644 index 82d760cc3ce..00000000000 --- a/docs/user/ppl/functions/conversion.rst +++ /dev/null @@ -1,203 +0,0 @@ -========================= -Type Conversion Functions -========================= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -CAST ----- - -Description ->>>>>>>>>>> - -Usage: cast(expr as dateType) cast the expr to dataType. return the value of dataType. The following conversion rules are used: - -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP | DATE | TIME | IP | -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| STRING | | Note1 | Note1 | TIMESTAMP() | DATE() | TIME() | IP() | -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| NUMBER | Note1 | | v!=0 | N/A | N/A | N/A | N/A | -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| BOOLEAN | Note1 | v?1:0 | | N/A | N/A | N/A | N/A | -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| TIMESTAMP | Note1 | N/A | N/A | | DATE() | TIME() | N/A | -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| DATE | Note1 | N/A | N/A | N/A | | N/A | N/A | -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| TIME | Note1 | N/A | N/A | N/A | N/A | | N/A | -+------------+--------+--------+---------+-------------+--------+--------+--------+ -| IP | Note2 | N/A | N/A | N/A | N/A | N/A | | -+------------+--------+--------+---------+-------------+--------+--------+--------+ - -Note1: the conversion follow the JDK specification. - -Note2: IP will be converted to its canonical representation. Canonical representation -for IPv6 is described in `RFC 5952 `_. 
- -Cast to string example:: - - os> source=people | eval `cbool` = CAST(true as string), `cint` = CAST(1 as string), `cdate` = CAST(CAST('2012-08-07' as date) as string) | fields `cbool`, `cint`, `cdate` - fetched rows / total rows = 1/1 - +-------+------+------------+ - | cbool | cint | cdate | - |-------+------+------------| - | TRUE | 1 | 2012-08-07 | - +-------+------+------------+ - -Cast to number example:: - - os> source=people | eval `cbool` = CAST(true as int), `cstring` = CAST('1' as int) | fields `cbool`, `cstring` - fetched rows / total rows = 1/1 - +-------+---------+ - | cbool | cstring | - |-------+---------| - | 1 | 1 | - +-------+---------+ - -Cast to date example:: - - os> source=people | eval `cdate` = CAST('2012-08-07' as date), `ctime` = CAST('01:01:01' as time), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp) | fields `cdate`, `ctime`, `ctimestamp` - fetched rows / total rows = 1/1 - +------------+----------+---------------------+ - | cdate | ctime | ctimestamp | - |------------+----------+---------------------| - | 2012-08-07 | 01:01:01 | 2012-08-07 01:01:01 | - +------------+----------+---------------------+ - -Cast function can be chained:: - - os> source=people | eval `cbool` = CAST(CAST(true as string) as boolean) | fields `cbool` - fetched rows / total rows = 1/1 - +-------+ - | cbool | - |-------| - | True | - +-------+ - - -IMPLICIT (AUTO) TYPE CONVERSION -------------------------------- - -Implicit conversion is automatic casting. When a function does not have an exact match for the -input types, the engine looks for another signature that can safely work with the values. It picks -the option that requires the least stretching of the original types, so you can mix literals and -fields without adding ``CAST`` everywhere. - -String to numeric ->>>>>>>>>>>>>>>>> - -When a string stands in for a number we simply parse the text: - -- The value must be something like ``"3.14"`` or ``"42"``. Anything else causes the query to fail. -- If a string appears next to numeric arguments, it is treated as a ``DOUBLE`` so the numeric - overload of the function can run. - -Use string in arithmetic operator example :: - - os> source=people | eval divide="5"/10, multiply="5" * 10, add="5" + 10, minus="5" - 10, concat="5" + "5" | fields divide, multiply, add, minus, concat - fetched rows / total rows = 1/1 - +--------+----------+------+-------+--------+ - | divide | multiply | add | minus | concat | - |--------+----------+------+-------+--------| - | 0.5 | 50.0 | 15.0 | -5.0 | 55 | - +--------+----------+------+-------+--------+ - -Use string in comparison operator example :: - - os> source=people | eval e="1000"==1000, en="1000"!=1000, ed="1000"==1000.0, edn="1000"!=1000.0, l="1000">999, ld="1000">999.9, i="malformed"==1000 | fields e, en, ed, edn, l, ld, i - fetched rows / total rows = 1/1 - +------+-------+------+-------+------+------+------+ - | e | en | ed | edn | l | ld | i | - |------+-------+------+-------+------+------+------| - | True | False | True | False | True | True | null | - +------+-------+------+-------+------+------+------+ - - -TOSTRING ------------ - -Description ->>>>>>>>>>> -The following usage options are available, depending on the parameter types and the number of parameters. - -Usage with format type: tostring(ANY, [format]): Converts the value in first argument to provided format type string in second argument. If second argument is not provided, then it converts to default string representation. 
-Return type: string - -Usage for boolean parameter without format type tostring(boolean): Converts the string to 'TRUE' or 'FALSE'. -Return type: string - -You can use this function with the eval commands and as part of eval expressions. If first argument can be any valid type , second argument is optional and if provided , it needs to be format name to convert to where first argument contains only numbers. If first argument is boolean, then second argument is not used even if its provided. - -Format types: - -a) "binary" Converts a number to a binary value. -b) "hex" Converts the number to a hexadecimal value. -c) "commas" Formats the number with commas. If the number includes a decimal, the function rounds the number to nearest two decimal places. -d) "duration" Converts the value in seconds to the readable time format HH:MM:SS. -e) "duration_millis" Converts the value in milliseconds to the readable time format HH:MM:SS. - -The format argument is optional and is only used when the value argument is a number. The tostring function supports the following formats. - -Basic examples: - -You can use this function to convert a number to a string of its binary representation. -Example:: -city, city.name, city.location.latitude - os> source=accounts | where firstname = "Amber" | eval balance_binary = tostring(balance, "binary") | fields firstname, balance_binary, balance - fetched rows / total rows = 1/1 - +-----------+------------------+---------+ - | firstname | balance_binary | balance | - |-----------+------------------+---------| - | Amber | 1001100100111001 | 39225 | - +-----------+------------------+---------+ - - -You can use this function to convert a number to a string of its hex representation. -Example:: - - os> source=accounts | where firstname = "Amber" | eval balance_hex = tostring(balance, "hex") | fields firstname, balance_hex, balance - fetched rows / total rows = 1/1 - +-----------+-------------+---------+ - | firstname | balance_hex | balance | - |-----------+-------------+---------| - | Amber | 9939 | 39225 | - +-----------+-------------+---------+ - -The following example formats the column totalSales to display values with commas. -Example:: - - os> source=accounts | where firstname = "Amber" | eval balance_commas = tostring(balance, "commas") | fields firstname, balance_commas, balance - fetched rows / total rows = 1/1 - +-----------+----------------+---------+ - | firstname | balance_commas | balance | - |-----------+----------------+---------| - | Amber | 39,225 | 39225 | - +-----------+----------------+---------+ - -The following example converts number of seconds to HH:MM:SS format representing hours, minutes and seconds. -Example:: - - os> source=accounts | where firstname = "Amber" | eval duration = tostring(6500, "duration") | fields firstname, duration - fetched rows / total rows = 1/1 - +-----------+----------+ - | firstname | duration | - |-----------+----------| - | Amber | 01:48:20 | - +-----------+----------+ - -The following example for converts boolean parameter to string. 
-Example:: - - os> source=accounts | where firstname = "Amber"| eval `boolean_str` = tostring(1=1)| fields `boolean_str` - fetched rows / total rows = 1/1 - +-------------+ - | boolean_str | - |-------------| - | TRUE | - +-------------+ - diff --git a/docs/user/ppl/functions/cryptographic.md b/docs/user/ppl/functions/cryptographic.md new file mode 100644 index 00000000000..a46fb85ff6f --- /dev/null +++ b/docs/user/ppl/functions/cryptographic.md @@ -0,0 +1,101 @@ +# PPL Cryptographic Functions + +## MD5 + +### Description + +Version: 3.1.0 +Usage: `md5(str)` calculates the MD5 digest and returns the value as a 32 character hex string. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `MD5('hello')` = MD5('hello') +| fields `MD5('hello')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------------+ +| MD5('hello') | +|----------------------------------| +| 5d41402abc4b2a76b9719d911017c592 | ++----------------------------------+ +``` + +## SHA1 + +### Description + +Version: 3.1.0 +Usage: `sha1(str)` returns the hex string result of SHA-1. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `SHA1('hello')` = SHA1('hello') +| fields `SHA1('hello')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------------------------------+ +| SHA1('hello') | +|------------------------------------------| +| aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d | ++------------------------------------------+ +``` + +## SHA2 + +### Description + +Version: 3.1.0 +Usage: `sha2(str, numBits)` returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, and SHA-512). +The numBits indicates the desired bit length of the result, which must have a value of 224, 256, 384, or 512. +Argument type: STRING, INTEGER +Return type: STRING +Example + +```ppl +source=people +| eval `SHA2('hello',256)` = SHA2('hello',256) +| fields `SHA2('hello',256)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------------------------------------------------------+ +| SHA2('hello',256) | +|------------------------------------------------------------------| +| 2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824 | ++------------------------------------------------------------------+ +``` + +```ppl +source=people +| eval `SHA2('hello',512)` = SHA2('hello',512) +| fields `SHA2('hello',512)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------------------------------------------------------------------------------------------------------------+ +| SHA2('hello',512) | +|----------------------------------------------------------------------------------------------------------------------------------| +| 9b71d224bd62f3785d96d46ad3ea3d73319bfbc2890caadae2dff72519673ca72323c3d99ba5c11d7c7acc6e14b8c5da0c4663475c2e5c3adef46f73bcdec043 | ++----------------------------------------------------------------------------------------------------------------------------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/cryptographic.rst b/docs/user/ppl/functions/cryptographic.rst deleted file mode 100644 index c31121c3014..00000000000 --- a/docs/user/ppl/functions/cryptographic.rst +++ /dev/null @@ -1,90 +0,0 @@ -=========================== -PPL Cryptographic Functions -=========================== - -.. rubric:: Table of contents - -.. 
contents::
-   :local:
-   :depth: 1
-
-MD5
----
-
-Description
->>>>>>>>>>>
-
-Version: 3.1.0
-
-Usage: ``md5(str)`` calculates the MD5 digest and returns the value as a 32 character hex string.
-
-Argument type: STRING
-
-Return type: STRING
-
-Example::
-
-    os> source=people | eval `MD5('hello')` = MD5('hello') | fields `MD5('hello')`
-    fetched rows / total rows = 1/1
-    +----------------------------------+
-    | MD5('hello')                     |
-    |----------------------------------|
-    | 5d41402abc4b2a76b9719d911017c592 |
-    +----------------------------------+
-
-SHA1
-----
-
-Description
->>>>>>>>>>>
-
-Version: 3.1.0
-
-Usage: ``sha1(str)`` returns the hex string result of SHA-1.
-
-Argument type: STRING
-
-Return type: STRING
-
-Example::
-
-    os> source=people | eval `SHA1('hello')` = SHA1('hello') | fields `SHA1('hello')`
-    fetched rows / total rows = 1/1
-    +------------------------------------------+
-    | SHA1('hello')                            |
-    |------------------------------------------|
-    | aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d |
-    +------------------------------------------+
-
-SHA2
-----
-
-Description
->>>>>>>>>>>
-
-Version: 3.1.0
-
-Usage: ``sha2(str, numBits)`` returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, and SHA-512).
-The numBits indicates the desired bit length of the result, which must have a value of 224, 256, 384, or 512.
-
-Argument type: STRING, INTEGER
-
-Return type: STRING
-
-Example::
-
-    os> source=people | eval `SHA2('hello',256)` = SHA2('hello',256) | fields `SHA2('hello',256)`
-    fetched rows / total rows = 1/1
-    +------------------------------------------------------------------+
-    | SHA2('hello',256)                                                |
-    |------------------------------------------------------------------|
-    | 2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824 |
-    +------------------------------------------------------------------+
-
-    os> source=people | eval `SHA2('hello',512)` = SHA2('hello',512) | fields `SHA2('hello',512)`
-    fetched rows / total rows = 1/1
-    +----------------------------------------------------------------------------------------------------------------------------------+
-    | SHA2('hello',512)                                                                                                                |
-    |----------------------------------------------------------------------------------------------------------------------------------|
-    | 9b71d224bd62f3785d96d46ad3ea3d73319bfbc2890caadae2dff72519673ca72323c3d99ba5c11d7c7acc6e14b8c5da0c4663475c2e5c3adef46f73bcdec043 |
-    +----------------------------------------------------------------------------------------------------------------------------------+
diff --git a/docs/user/ppl/functions/datetime.md b/docs/user/ppl/functions/datetime.md
new file mode 100644
index 00000000000..d8c72389d18
--- /dev/null
+++ b/docs/user/ppl/functions/datetime.md
@@ -0,0 +1,2782 @@
+# Date and Time Functions
+
+All PPL date and time functions use the UTC time zone. Both input and output values are interpreted as UTC.
+For instance, an input timestamp literal like '2020-08-26 01:01:01' is assumed to be in UTC, and the now()
+function also returns the current date and time in UTC.
+
+## ADDDATE
+
+### Description
+
+Usage: adddate(date, INTERVAL expr unit) adds the interval of the second argument to date; adddate(date, days) adds the second argument as an integer number of days to date.
+If first argument is TIME, today's date is used; if first argument is DATE, time at midnight is used.
+Argument type: DATE/TIMESTAMP/TIME, INTERVAL/LONG +Return type map: +(DATE/TIMESTAMP/TIME, INTERVAL) -> TIMESTAMP +(DATE, LONG) -> DATE +(TIMESTAMP/TIME, LONG) -> TIMESTAMP +Synonyms: [DATE_ADD](#date_add) when invoked with the INTERVAL form of the second argument. +Antonyms: [SUBDATE](#subdate) +Example + +```ppl +source=people +| eval `'2020-08-26' + 1h` = ADDDATE(DATE('2020-08-26'), INTERVAL 1 HOUR), `'2020-08-26' + 1` = ADDDATE(DATE('2020-08-26'), 1), `ts '2020-08-26 01:01:01' + 1` = ADDDATE(TIMESTAMP('2020-08-26 01:01:01'), 1) +| fields `'2020-08-26' + 1h`, `'2020-08-26' + 1`, `ts '2020-08-26 01:01:01' + 1` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------+------------------+------------------------------+ +| '2020-08-26' + 1h | '2020-08-26' + 1 | ts '2020-08-26 01:01:01' + 1 | +|---------------------+------------------+------------------------------| +| 2020-08-26 01:00:00 | 2020-08-27 | 2020-08-27 01:01:01 | ++---------------------+------------------+------------------------------+ +``` + +## ADDTIME + +### Description + +Usage: addtime(expr1, expr2) adds expr2 to expr1 and returns the result. If argument is TIME, today's date is used; if argument is DATE, time at midnight is used. +Argument type: DATE/TIMESTAMP/TIME, DATE/TIMESTAMP/TIME +Return type map: +(DATE/TIMESTAMP, DATE/TIMESTAMP/TIME) -> TIMESTAMP +(TIME, DATE/TIMESTAMP/TIME) -> TIME +Antonyms: [SUBTIME](#subtime) +Example + +```ppl +source=people +| eval `'2008-12-12' + 0` = ADDTIME(DATE('2008-12-12'), DATE('2008-11-15')) +| fields `'2008-12-12' + 0` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------+ +| '2008-12-12' + 0 | +|---------------------| +| 2008-12-12 00:00:00 | ++---------------------+ +``` + +```ppl +source=people +| eval `'23:59:59' + 0` = ADDTIME(TIME('23:59:59'), DATE('2004-01-01')) +| fields `'23:59:59' + 0` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| '23:59:59' + 0 | +|----------------| +| 23:59:59 | ++----------------+ +``` + +```ppl +source=people +| eval `'2004-01-01' + '23:59:59'` = ADDTIME(DATE('2004-01-01'), TIME('23:59:59')) +| fields `'2004-01-01' + '23:59:59'` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------+ +| '2004-01-01' + '23:59:59' | +|---------------------------| +| 2004-01-01 23:59:59 | ++---------------------------+ +``` + +```ppl +source=people +| eval `'10:20:30' + '00:05:42'` = ADDTIME(TIME('10:20:30'), TIME('00:05:42')) +| fields `'10:20:30' + '00:05:42'` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------+ +| '10:20:30' + '00:05:42' | +|-------------------------| +| 10:26:12 | ++-------------------------+ +``` + +```ppl +source=people +| eval `'2007-02-28 10:20:30' + '20:40:50'` = ADDTIME(TIMESTAMP('2007-02-28 10:20:30'), TIMESTAMP('2002-03-04 20:40:50')) +| fields `'2007-02-28 10:20:30' + '20:40:50'` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------------------------+ +| '2007-02-28 10:20:30' + '20:40:50' | +|------------------------------------| +| 2007-03-01 07:01:20 | ++------------------------------------+ +``` + +## CONVERT_TZ + +### Description + +Usage: convert_tz(timestamp, from_timezone, to_timezone) constructs a local timestamp converted from the from_timezone to the to_timezone. CONVERT_TZ returns null when any of the three function arguments are invalid, i.e. 
timestamp is not in the format yyyy-MM-dd HH:mm:ss or the timezone is not in (+/-)HH:mm format. It also returns null for invalid dates, such as February 30th, and for invalid timezones, which are ones outside of -13:59 and +14:00.
+Argument type: TIMESTAMP/STRING, STRING, STRING
+Return type: TIMESTAMP
+Conversion from the +00:00 timezone to the +10:00 timezone. Returns the timestamp argument converted from +00:00 to +10:00.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-05-15 12:00:00','+00:00','+10:00')` = convert_tz('2008-05-15 12:00:00','+00:00','+10:00')
+| fields `convert_tz('2008-05-15 12:00:00','+00:00','+10:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-05-15 12:00:00','+00:00','+10:00') |
+|-----------------------------------------------------|
+| 2008-05-15 22:00:00                                 |
++-----------------------------------------------------+
+```
+
+The valid timezone range for convert_tz is (-13:59, +14:00) inclusive. Timezones outside of the range, such as +15:00 in this example, will return null.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-05-15 12:00:00','+00:00','+15:00')` = convert_tz('2008-05-15 12:00:00','+00:00','+15:00')
+| fields `convert_tz('2008-05-15 12:00:00','+00:00','+15:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-05-15 12:00:00','+00:00','+15:00') |
+|-----------------------------------------------------|
+| null                                                |
++-----------------------------------------------------+
+```
+
+Conversion from a positive timezone to a negative timezone that crosses the date line.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-05-15 12:00:00','+03:30','-10:00')` = convert_tz('2008-05-15 12:00:00','+03:30','-10:00')
+| fields `convert_tz('2008-05-15 12:00:00','+03:30','-10:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-05-15 12:00:00','+03:30','-10:00') |
+|-----------------------------------------------------|
+| 2008-05-14 22:30:00                                 |
++-----------------------------------------------------+
+```
+
+Valid dates are required in convert_tz; invalid dates such as April 31st (not a date in the Gregorian calendar) will result in null.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-04-31 12:00:00','+03:30','-10:00')` = convert_tz('2008-04-31 12:00:00','+03:30','-10:00')
+| fields `convert_tz('2008-04-31 12:00:00','+03:30','-10:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-04-31 12:00:00','+03:30','-10:00') |
+|-----------------------------------------------------|
+| null                                                |
++-----------------------------------------------------+
+```
+
+Valid dates are required in convert_tz; invalid dates such as February 30th (not a date in the Gregorian calendar) will result in null.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-02-30 12:00:00','+03:30','-10:00')` = convert_tz('2008-02-30 12:00:00','+03:30','-10:00')
+| fields `convert_tz('2008-02-30 12:00:00','+03:30','-10:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-02-30 12:00:00','+03:30','-10:00') |
+|-----------------------------------------------------|
+| null                                                |
++-----------------------------------------------------+
+```
+
+February 29th 2008 is a valid date because it is a leap year.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-02-29 12:00:00','+03:30','-10:00')` = convert_tz('2008-02-29 12:00:00','+03:30','-10:00')
+| fields `convert_tz('2008-02-29 12:00:00','+03:30','-10:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-02-29 12:00:00','+03:30','-10:00') |
+|-----------------------------------------------------|
+| 2008-02-28 22:30:00                                 |
++-----------------------------------------------------+
+```
+
+Valid dates are required in convert_tz; invalid dates such as February 29th 2007 (2007 is not a leap year) will result in null.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2007-02-29 12:00:00','+03:30','-10:00')` = convert_tz('2007-02-29 12:00:00','+03:30','-10:00')
+| fields `convert_tz('2007-02-29 12:00:00','+03:30','-10:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2007-02-29 12:00:00','+03:30','-10:00') |
+|-----------------------------------------------------|
+| null                                                |
++-----------------------------------------------------+
+```
+
+The valid timezone range for convert_tz is (-13:59, +14:00) inclusive. Timezones outside of the range, such as +14:01 in this example, will return null.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-02-01 12:00:00','+14:01','+00:00')` = convert_tz('2008-02-01 12:00:00','+14:01','+00:00')
+| fields `convert_tz('2008-02-01 12:00:00','+14:01','+00:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-02-01 12:00:00','+14:01','+00:00') |
+|-----------------------------------------------------|
+| null                                                |
++-----------------------------------------------------+
+```
+
+The valid timezone range for convert_tz is (-13:59, +14:00) inclusive. Timezones within the range, such as +14:00 in this example, will return a correctly converted datetime object.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-02-01 12:00:00','+14:00','+00:00')` = convert_tz('2008-02-01 12:00:00','+14:00','+00:00')
+| fields `convert_tz('2008-02-01 12:00:00','+14:00','+00:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-02-01 12:00:00','+14:00','+00:00') |
+|-----------------------------------------------------|
+| 2008-01-31 22:00:00                                 |
++-----------------------------------------------------+
+```
+
+The valid timezone range for convert_tz is (-13:59, +14:00) inclusive. Timezones outside of the range, such as -14:00, will result in null.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-02-01 12:00:00','-14:00','+00:00')` = convert_tz('2008-02-01 12:00:00','-14:00','+00:00')
+| fields `convert_tz('2008-02-01 12:00:00','-14:00','+00:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-02-01 12:00:00','-14:00','+00:00') |
+|-----------------------------------------------------|
+| null                                                |
++-----------------------------------------------------+
+```
+
+The valid timezone range for convert_tz is (-13:59, +14:00) inclusive. This timezone is within the range, so it is valid and will convert the time.
+Example
+
+```ppl
+source=people
+| eval `convert_tz('2008-02-01 12:00:00','-13:59','+00:00')` = convert_tz('2008-02-01 12:00:00','-13:59','+00:00')
+| fields `convert_tz('2008-02-01 12:00:00','-13:59','+00:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| convert_tz('2008-02-01 12:00:00','-13:59','+00:00') |
+|-----------------------------------------------------|
+| 2008-02-02 01:59:00                                 |
++-----------------------------------------------------+
+```
+
+## CURDATE
+
+### Description
+
+Returns the current date as a value in 'YYYY-MM-DD' format.
+CURDATE() returns the current date in UTC at the time the statement is executed.
+Return type: DATE
+Specification: CURDATE() -> DATE
+Example
+
+```ppl ignore
+source=people
+| eval `CURDATE()` = CURDATE()
+| fields `CURDATE()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------+
+| CURDATE()  |
+|------------|
+| 2025-08-02 |
++------------+
+```
+
+## CURRENT_DATE
+
+### Description
+
+`CURRENT_DATE()` is a synonym for [CURDATE()](#curdate).
+Example
+
+```ppl ignore
+source=people
+| eval `CURRENT_DATE()` = CURRENT_DATE()
+| fields `CURRENT_DATE()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------+
+| CURRENT_DATE()   |
+|------------------|
+| 2025-08-02       |
++------------------+
+```
+
+## CURRENT_TIME
+
+### Description
+
+`CURRENT_TIME()` is a synonym for [CURTIME()](#curtime).
+Example
+
+```ppl ignore
+source=people
+| eval `CURRENT_TIME()` = CURRENT_TIME()
+| fields `CURRENT_TIME()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------+
+| CURRENT_TIME()   |
+|------------------|
+| 15:39:05         |
++------------------+
+```
+
+## CURRENT_TIMESTAMP
+
+### Description
+
+`CURRENT_TIMESTAMP()` is a synonym for [NOW()](#now).
+Example
+
+```ppl ignore
+source=people
+| eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP()
+| fields `CURRENT_TIMESTAMP()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------+
+| CURRENT_TIMESTAMP()   |
+|-----------------------|
+| 2025-08-02 15:54:19   |
++-----------------------+
+```
+
+## CURTIME
+
+### Description
+
+Returns the current time as a value in 'hh:mm:ss' format in the UTC time zone.
+CURTIME() returns the time at which the statement began to execute, as [NOW()](#now) does.
+Return type: TIME
+Specification: CURTIME() -> TIME
+Example
+
+```ppl ignore
+source=people
+| eval `value_1` = CURTIME(), `value_2` = CURTIME()
+| fields `value_1`, `value_2`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------+----------+
+| value_1  | value_2  |
+|----------+----------|
+| 15:39:05 | 15:39:05 |
++----------+----------+
+```
+
+## DATE
+
+### Description
+
+Usage: date(expr) constructs a date type with the input string expr as a date. If the argument is of date/timestamp, it extracts the date value part from the expression.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: DATE
+Example
+
+```ppl
+source=people
+| eval `DATE('2020-08-26')` = DATE('2020-08-26')
+| fields `DATE('2020-08-26')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------+
+| DATE('2020-08-26') |
+|--------------------|
+| 2020-08-26         |
++--------------------+
+```
+
+```ppl
+source=people
+| eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00'))
+| fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------------+
+| DATE(TIMESTAMP('2020-08-26 13:49:00')) |
+|----------------------------------------|
+| 2020-08-26                             |
++----------------------------------------+
+```
+
+```ppl
+source=people
+| eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49')
+| fields `DATE('2020-08-26 13:49')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------+
+| DATE('2020-08-26 13:49') |
+|--------------------------|
+| 2020-08-26               |
++--------------------------+
+```
+
+## DATE_ADD
+
+### Description
+
+Usage: date_add(date, INTERVAL expr unit) adds the interval expr to date. If first argument is TIME, today's date is used; if first argument is DATE, time at midnight is used.
+Argument type: DATE/TIMESTAMP/TIME, INTERVAL
+Return type: TIMESTAMP
+Synonyms: [ADDDATE](#adddate)
+Antonyms: [DATE_SUB](#date_sub)
+Example
+
+```ppl
+source=people
+| eval `'2020-08-26' + 1h` = DATE_ADD(DATE('2020-08-26'), INTERVAL 1 HOUR), `ts '2020-08-26 01:01:01' + 1d` = DATE_ADD(TIMESTAMP('2020-08-26 01:01:01'), INTERVAL 1 DAY)
+| fields `'2020-08-26' + 1h`, `ts '2020-08-26 01:01:01' + 1d`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+-------------------------------+
+| '2020-08-26' + 1h   | ts '2020-08-26 01:01:01' + 1d |
+|---------------------+-------------------------------|
+| 2020-08-26 01:00:00 | 2020-08-27 01:01:01           |
++---------------------+-------------------------------+
+```
+
+## DATE_FORMAT
+
+### Description
+
+Usage: date_format(date, format) formats the date argument using the specifiers in the format argument.
+If an argument of type TIME is provided, the local date is used.
+The following table describes the available specifier arguments.
+
+
+| Specifier | Description |
+| --- | --- |
+| %a | Abbreviated weekday name (Sun..Sat) |
+| %b | Abbreviated month name (Jan..Dec) |
+| %c | Month, numeric (0..12) |
+| %D | Day of the month with English suffix (0th, 1st, 2nd, 3rd, ...) |
+| %d | Day of the month, numeric (00..31) |
+| %e | Day of the month, numeric (0..31) |
+| %f | Microseconds (000000..999999) |
+| %H | Hour (00..23) |
+| %h | Hour (01..12) |
+| %I | Hour (01..12) |
+| %i | Minutes, numeric (00..59) |
+| %j | Day of year (001..366) |
+| %k | Hour (0..23) |
+| %l | Hour (1..12) |
+| %M | Month name (January..December) |
+| %m | Month, numeric (00..12) |
+| %p | AM or PM |
+| %r | Time, 12-hour (hh:mm:ss followed by AM or PM) |
+| %S | Seconds (00..59) |
+| %s | Seconds (00..59) |
+| %T | Time, 24-hour (hh:mm:ss) |
+| %U | Week (00..53), where Sunday is the first day of the week; WEEK() mode 0 |
+| %u | Week (00..53), where Monday is the first day of the week; WEEK() mode 1 |
+| %V | Week (01..53), where Sunday is the first day of the week; WEEK() mode 2; used with %X |
+| %v | Week (01..53), where Monday is the first day of the week; WEEK() mode 3; used with %x |
+| %W | Weekday name (Sunday..Saturday) |
+| %w | Day of the week (0=Sunday..6=Saturday) |
+| %X | Year for the week where Sunday is the first day of the week, numeric, four digits; used with %V |
+| %x | Year for the week, where Monday is the first day of the week, numeric, four digits; used with %v |
+| %Y | Year, numeric, four digits |
+| %y | Year, numeric (two digits) |
+| %% | A literal % character |
+| %x | x, for any “x” not listed above |
+| x | x, for any lowercase/uppercase letter except [aydmshiHIMYDSEL] |
+
+
+Argument type: STRING/DATE/TIME/TIMESTAMP, STRING
+Return type: STRING
+Example
+
+```ppl
+source=people
+| eval `DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f')` = DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r')
+| fields `DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------------------------+---------------------------------------------------------------------+
+| DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f') | DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r') |
+|----------------------------------------------------+---------------------------------------------------------------------|
+| 13:14:15.012345                                    | 1998-Jan-31st 01:14:15 PM                                            |
++----------------------------------------------------+---------------------------------------------------------------------+
+```
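+
+As another illustrative sketch (not doctested), the name and numeric specifiers from the table above can be combined into a single format string; 2020-08-26 was a Wednesday, as the DAYNAME example below confirms.
+Example
+
+```ppl ignore
+source=people
+| eval `DATE_FORMAT(DATE('2020-08-26'), '%W %M %e, %Y')` = DATE_FORMAT(DATE('2020-08-26'), '%W %M %e, %Y')
+| fields `DATE_FORMAT(DATE('2020-08-26'), '%W %M %e, %Y')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------------------------+
+| DATE_FORMAT(DATE('2020-08-26'), '%W %M %e, %Y')  |
+|--------------------------------------------------|
+| Wednesday August 26, 2020                        |
++--------------------------------------------------+
+```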
+
+## DATETIME
+
+### Description
+
+Usage: DATETIME(timestamp) / DATETIME(date, to_timezone) converts the datetime to a new timezone.
+Argument type: TIMESTAMP/STRING
+Return type map:
+(TIMESTAMP, STRING) -> TIMESTAMP
+(TIMESTAMP) -> TIMESTAMP
+Converting a timestamp with timezone to the timezone given by the second argument.
+Example
+
+```ppl
+source=people
+| eval `DATETIME('2004-02-28 23:00:00-10:00', '+10:00')` = DATETIME('2004-02-28 23:00:00-10:00', '+10:00')
+| fields `DATETIME('2004-02-28 23:00:00-10:00', '+10:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------------------------+
+| DATETIME('2004-02-28 23:00:00-10:00', '+10:00') |
+|-------------------------------------------------|
+| 2004-02-29 19:00:00                             |
++-------------------------------------------------+
+```
+
+The valid timezone range for DATETIME is (-13:59, +14:00) inclusive. Timezones outside of the range will result in null.
+Example
+
+```ppl
+source=people
+| eval `DATETIME('2008-01-01 02:00:00', '-14:00')` = DATETIME('2008-01-01 02:00:00', '-14:00')
+| fields `DATETIME('2008-01-01 02:00:00', '-14:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------------------+
+| DATETIME('2008-01-01 02:00:00', '-14:00') |
+|-------------------------------------------|
+| null                                      |
++-------------------------------------------+
+```
+
+## DATE_SUB
+
+### Description
+
+Usage: date_sub(date, INTERVAL expr unit) subtracts the interval expr from date. If first argument is TIME, today's date is used; if first argument is DATE, time at midnight is used.
+Argument type: DATE/TIMESTAMP/TIME, INTERVAL
+Return type: TIMESTAMP
+Synonyms: [SUBDATE](#subdate)
+Antonyms: [DATE_ADD](#date_add)
+Example
+
+```ppl
+source=people
+| eval `'2008-01-02' - 31d` = DATE_SUB(DATE('2008-01-02'), INTERVAL 31 DAY), `ts '2020-08-26 01:01:01' - 1h` = DATE_SUB(TIMESTAMP('2020-08-26 01:01:01'), INTERVAL 1 HOUR)
+| fields `'2008-01-02' - 31d`, `ts '2020-08-26 01:01:01' - 1h`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+-------------------------------+
+| '2008-01-02' - 31d  | ts '2020-08-26 01:01:01' - 1h |
+|---------------------+-------------------------------|
+| 2007-12-02 00:00:00 | 2020-08-26 00:01:01           |
++---------------------+-------------------------------+
+```
+
+## DATEDIFF
+
+### Description
+
+Usage: Calculates the difference of date parts of given values. If the first argument is time, today's date is used.
+Argument type: DATE/TIMESTAMP/TIME, DATE/TIMESTAMP/TIME
+Return type: LONG
+Example
+
+```ppl
+source=people
+| eval `'2000-01-02' - '2000-01-01'` = DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - '2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')), `today - today` = DATEDIFF(TIME('23:59:59'), TIME('00:00:00'))
+| fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`, `today - today`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+-----------------------------+---------------+
+| '2000-01-02' - '2000-01-01' | '2001-02-01' - '2004-01-01' | today - today |
+|-----------------------------+-----------------------------+---------------|
+| 1                           | -1064                       | 0             |
++-----------------------------+-----------------------------+---------------+
+```
+
+## DAY
+
+### Description
+
+Usage: day(date) extracts the day of the month for date, in the range 1 to 31.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: INTEGER
+Synonyms: [DAYOFMONTH](#dayofmonth), [DAY_OF_MONTH](#day_of_month)
+Example
+
+```ppl
+source=people
+| eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26'))
+| fields `DAY(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------+
+| DAY(DATE('2020-08-26')) |
+|-------------------------|
+| 26                      |
++-------------------------+
+```
+
+## DAYNAME
+
+### Description
+
+Usage: dayname(date) returns the name of the weekday for date, including Monday, Tuesday, Wednesday, Thursday, Friday, Saturday and Sunday.
+Argument type: STRING/DATE/TIMESTAMP +Return type: STRING +Example + +```ppl +source=people +| eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) +| fields `DAYNAME(DATE('2020-08-26'))` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------------+ +| DAYNAME(DATE('2020-08-26')) | +|-----------------------------| +| Wednesday | ++-----------------------------+ +``` + +## DAYOFMONTH + +### Description + +Usage: dayofmonth(date) extracts the day of the month for date, in the range 1 to 31. +Argument type: STRING/DATE/TIMESTAMP +Return type: INTEGER +Synonyms: [DAY](#day), [DAY_OF_MONTH](#day_of_month) +Example + +```ppl +source=people +| eval `DAYOFMONTH(DATE('2020-08-26'))` = DAYOFMONTH(DATE('2020-08-26')) +| fields `DAYOFMONTH(DATE('2020-08-26'))` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------+ +| DAYOFMONTH(DATE('2020-08-26')) | +|--------------------------------| +| 26 | ++--------------------------------+ +``` + +## DAY_OF_MONTH + +### Description + +Usage: day_of_month(date) extracts the day of the month for date, in the range 1 to 31. +Argument type: STRING/DATE/TIMESTAMP +Return type: INTEGER +Synonyms: [DAY](#day), [DAYOFMONTH](#dayofmonth) +Example + +```ppl +source=people +| eval `DAY_OF_MONTH(DATE('2020-08-26'))` = DAY_OF_MONTH(DATE('2020-08-26')) +| fields `DAY_OF_MONTH(DATE('2020-08-26'))` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------------+ +| DAY_OF_MONTH(DATE('2020-08-26')) | +|----------------------------------| +| 26 | ++----------------------------------+ +``` + +## DAYOFWEEK + +### Description + +Usage: dayofweek(date) returns the weekday index for date (1 = Sunday, 2 = Monday, ..., 7 = Saturday). +Argument type: STRING/DATE/TIMESTAMP +Return type: INTEGER +Synonyms: [DAY_OF_WEEK](#day_of_week) +Example + +```ppl +source=people +| eval `DAYOFWEEK(DATE('2020-08-26'))` = DAYOFWEEK(DATE('2020-08-26')) +| fields `DAYOFWEEK(DATE('2020-08-26'))` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------------+ +| DAYOFWEEK(DATE('2020-08-26')) | +|-------------------------------| +| 4 | ++-------------------------------+ +``` + +## DAY_OF_WEEK + +### Description + +Usage: day_of_week(date) returns the weekday index for date (1 = Sunday, 2 = Monday, ..., 7 = Saturday). +Argument type: STRING/DATE/TIMESTAMP +Return type: INTEGER +Synonyms: [DAYOFWEEK](#dayofweek) +Example + +```ppl +source=people +| eval `DAY_OF_WEEK(DATE('2020-08-26'))` = DAY_OF_WEEK(DATE('2020-08-26')) +| fields `DAY_OF_WEEK(DATE('2020-08-26'))` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------------+ +| DAY_OF_WEEK(DATE('2020-08-26')) | +|---------------------------------| +| 4 | ++---------------------------------+ +``` + +## DAYOFYEAR + +### Description + +Usage: dayofyear(date) returns the day of the year for date, in the range 1 to 366. 
+Argument type: STRING/DATE/TIMESTAMP
+Return type: INTEGER
+Synonyms: [DAY_OF_YEAR](#day_of_year)
+Example
+
+```ppl
+source=people
+| eval `DAYOFYEAR(DATE('2020-08-26'))` = DAYOFYEAR(DATE('2020-08-26'))
+| fields `DAYOFYEAR(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------+
+| DAYOFYEAR(DATE('2020-08-26')) |
+|-------------------------------|
+| 239                           |
++-------------------------------+
+```
+
+## DAY_OF_YEAR
+
+### Description
+
+Usage: day_of_year(date) returns the day of the year for date, in the range 1 to 366.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: INTEGER
+Synonyms: [DAYOFYEAR](#dayofyear)
+Example
+
+```ppl
+source=people
+| eval `DAY_OF_YEAR(DATE('2020-08-26'))` = DAY_OF_YEAR(DATE('2020-08-26'))
+| fields `DAY_OF_YEAR(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------------------+
+| DAY_OF_YEAR(DATE('2020-08-26')) |
+|---------------------------------|
+| 239                             |
++---------------------------------+
+```
+
+## EXTRACT
+
+### Description
+
+Usage: extract(part FROM date) returns a LONG whose digits are arranged according to the given 'part' argument.
+The specific format of the returned LONG is determined by the table below.
+Argument type: PART, where PART is one of the following tokens in the table below.
+The format specifiers found in this table are the same as those found in the [DATE_FORMAT](#date_format) function.
+The following table describes the mapping of a 'part' to a particular format.
+
+
+| Part | Format |
+| --- | --- |
+| MICROSECOND | %f |
+| SECOND | %s |
+| MINUTE | %i |
+| HOUR | %H |
+| DAY | %d |
+| WEEK | %X |
+| MONTH | %m |
+| YEAR | %V |
+| SECOND_MICROSECOND | %s%f |
+| MINUTE_MICROSECOND | %i%s%f |
+| MINUTE_SECOND | %i%s |
+| HOUR_MICROSECOND | %H%i%s%f |
+| HOUR_SECOND | %H%i%s |
+| HOUR_MINUTE | %H%i |
+| DAY_MICROSECOND | %d%H%i%s%f |
+| DAY_SECOND | %d%H%i%s |
+| DAY_MINUTE | %d%H%i |
+| DAY_HOUR | %d%H |
+| YEAR_MONTH | %V%m |
+
+
+Return type: LONG
+Example
+
+```ppl
+source=people
+| eval `extract(YEAR_MONTH FROM "2023-02-07 10:11:12")` = extract(YEAR_MONTH FROM "2023-02-07 10:11:12")
+| fields `extract(YEAR_MONTH FROM "2023-02-07 10:11:12")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------------------------------------+
+| extract(YEAR_MONTH FROM "2023-02-07 10:11:12") |
+|------------------------------------------------|
+| 202302                                         |
++------------------------------------------------+
+```
+
+## FROM_DAYS
+
+### Description
+
+Usage: from_days(N) returns the date value given the day number N.
+Argument type: INTEGER/LONG
+Return type: DATE
+Example
+
+```ppl
+source=people
+| eval `FROM_DAYS(733687)` = FROM_DAYS(733687)
+| fields `FROM_DAYS(733687)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------+
+| FROM_DAYS(733687) |
+|-------------------|
+| 2008-10-07        |
++-------------------+
+```
+
+## FROM_UNIXTIME
+
+### Description
+
+Usage: Returns a representation of the argument given as a timestamp or character string value. This function performs the reverse conversion of the [UNIX_TIMESTAMP](#unix_timestamp) function.
+If the second argument is provided, it is used to format the result in the same way as the format string used for the [DATE_FORMAT](#date_format) function.
+If the timestamp is outside of the range 1970-01-01 00:00:00 - 3001-01-18 23:59:59.999999 (0 to 32536771199.999999 epoch time), the function returns NULL.
+Argument type: DOUBLE, STRING
+Return type map:
+DOUBLE -> TIMESTAMP
+DOUBLE, STRING -> STRING
+Examples
+
+```ppl
+source=people
+| eval `FROM_UNIXTIME(1220249547)` = FROM_UNIXTIME(1220249547)
+| fields `FROM_UNIXTIME(1220249547)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------------+
+| FROM_UNIXTIME(1220249547) |
+|---------------------------|
+| 2008-09-01 06:12:27       |
++---------------------------+
+```
+
+```ppl
+source=people
+| eval `FROM_UNIXTIME(1220249547, '%T')` = FROM_UNIXTIME(1220249547, '%T')
+| fields `FROM_UNIXTIME(1220249547, '%T')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------------------+
+| FROM_UNIXTIME(1220249547, '%T') |
+|---------------------------------|
+| 06:12:27                        |
++---------------------------------+
+```
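+
+Values outside the supported epoch range return NULL. The following sketch is illustrative (marked `ignore`, so it is not run by doctest); 32536771200 is one second past the documented upper bound.
+
+```ppl ignore
+source=people
+| eval `FROM_UNIXTIME(32536771200)` = FROM_UNIXTIME(32536771200)
+| fields `FROM_UNIXTIME(32536771200)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------+
+| FROM_UNIXTIME(32536771200) |
+|----------------------------|
+| null                       |
++----------------------------+
+```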
+
+## GET_FORMAT
+
+### Description
+
+Usage: Returns a string value containing string format specifiers based on the input arguments.
+Argument type: TYPE, STRING, where TYPE must be one of the following tokens: [DATE, TIME, TIMESTAMP], and
+STRING must be one of the following tokens: ["USA", "JIS", "ISO", "EUR", "INTERNAL"] (" can be replaced by ').
+Examples
+
+```ppl
+source=people
+| eval `GET_FORMAT(DATE, 'USA')` = GET_FORMAT(DATE, 'USA')
+| fields `GET_FORMAT(DATE, 'USA')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------+
+| GET_FORMAT(DATE, 'USA') |
+|-------------------------|
+| %m.%d.%Y                |
++-------------------------+
+```
+
+## HOUR
+
+### Description
+
+Usage: hour(time) extracts the hour value from time. Unlike a time-of-day value, a time value has a larger range and can be greater than 23, so the return value of hour(time) can also be greater than 23.
+Argument type: STRING/TIME/TIMESTAMP
+Return type: INTEGER
+Synonyms: [HOUR_OF_DAY](#hour_of_day)
+Example
+
+```ppl
+source=people
+| eval `HOUR(TIME('01:02:03'))` = HOUR(TIME('01:02:03'))
+| fields `HOUR(TIME('01:02:03'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------------+
+| HOUR(TIME('01:02:03')) |
+|------------------------|
+| 1                      |
++------------------------+
+```
+
+## HOUR_OF_DAY
+
+### Description
+
+Usage: hour_of_day(time) extracts the hour value from time. Unlike a time-of-day value, a time value has a larger range and can be greater than 23, so the return value of hour_of_day(time) can also be greater than 23.
+Argument type: STRING/TIME/TIMESTAMP
+Return type: INTEGER
+Synonyms: [HOUR](#hour)
+Example
+
+```ppl
+source=people
+| eval `HOUR_OF_DAY(TIME('01:02:03'))` = HOUR_OF_DAY(TIME('01:02:03'))
+| fields `HOUR_OF_DAY(TIME('01:02:03'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------+
+| HOUR_OF_DAY(TIME('01:02:03')) |
+|-------------------------------|
+| 1                             |
++-------------------------------+
+```
+
+## LAST_DAY
+
+### Description
+
+Usage: Returns the last day of the month as a DATE for a valid argument.
+Argument type: DATE/STRING/TIMESTAMP/TIME
+Return type: DATE
+Example
+
+```ppl
+source=people
+| eval `last_day('2023-02-06')` = last_day('2023-02-06')
+| fields `last_day('2023-02-06')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------------+
+| last_day('2023-02-06') |
+|------------------------|
+| 2023-02-28             |
++------------------------+
+```
+
+## LOCALTIMESTAMP
+
+### Description
+
+`LOCALTIMESTAMP()` is a synonym for [NOW()](#now).
+Example
+
+```ppl ignore
+source=people
+| eval `LOCALTIMESTAMP()` = LOCALTIMESTAMP()
+| fields `LOCALTIMESTAMP()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+
+| LOCALTIMESTAMP()    |
+|---------------------|
+| 2025-08-02 15:54:19 |
++---------------------+
+```
+
+## LOCALTIME
+
+### Description
+
+`LOCALTIME()` is a synonym for [NOW()](#now).
+Example
+
+```ppl ignore
+source=people
+| eval `LOCALTIME()` = LOCALTIME()
+| fields `LOCALTIME()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+
+| LOCALTIME()         |
+|---------------------|
+| 2025-08-02 15:54:19 |
++---------------------+
+```
+
+## MAKEDATE
+
+### Description
+
+Returns a date, given `year` and `day-of-year` values. `day-of-year` must be greater than 0 or the result is `NULL`. The result is also `NULL` if either argument is `NULL`.
+Both arguments are rounded to integers.
+Limitations:
+- A zero `year` is interpreted as 2000;
+- A negative `year` is not accepted;
+- `day-of-year` must be greater than zero;
+- A `day-of-year` greater than 365/366 rolls the calculation over into the following year(s) (see example).
+
+Specifications:
+1. MAKEDATE(DOUBLE, DOUBLE) -> DATE
+
+Argument type: DOUBLE
+Return type: DATE
+Example
+
+```ppl
+source=people
+| eval `MAKEDATE(1945, 5.9)` = MAKEDATE(1945, 5.9), `MAKEDATE(1984, 1984)` = MAKEDATE(1984, 1984)
+| fields `MAKEDATE(1945, 5.9)`, `MAKEDATE(1984, 1984)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+----------------------+
+| MAKEDATE(1945, 5.9) | MAKEDATE(1984, 1984) |
+|---------------------+----------------------|
+| 1945-01-06          | 1989-06-06           |
++---------------------+----------------------+
+```
+
+## MAKETIME
+
+### Description
+
+Returns a time value calculated from the hour, minute, and second arguments. Returns `NULL` if any of its arguments are `NULL`.
+The second argument can have a fractional part; the other arguments are rounded to integers.
+Limitations:
+- The 24-hour clock is used, so the available time range is [00:00:00.0 - 23:59:59.(9)];
+- Up to 9 digits of the fractional seconds part are kept (nanosecond precision).
+
+Specifications:
+1. MAKETIME(DOUBLE, DOUBLE, DOUBLE) -> TIME
+
+Argument type: DOUBLE
+Return type: TIME
+Example
+
+```ppl
+source=people
+| eval `MAKETIME(20, 30, 40)` = MAKETIME(20, 30, 40), `MAKETIME(20.2, 49.5, 42.100502)` = MAKETIME(20.2, 49.5, 42.100502)
+| fields `MAKETIME(20, 30, 40)`, `MAKETIME(20.2, 49.5, 42.100502)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------+---------------------------------+
+| MAKETIME(20, 30, 40) | MAKETIME(20.2, 49.5, 42.100502) |
+|----------------------+---------------------------------|
+| 20:30:40             | 20:50:42.100502                 |
++----------------------+---------------------------------+
+```
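+
+MAKEDATE and MAKETIME can be combined with [TIMESTAMP](#timestamp) to build a full timestamp from numeric parts. The following sketch is illustrative (marked `ignore`, so it is not run by doctest); assuming the two-argument TIMESTAMP behavior documented below, day 61 of the leap year 2020 is 2020-03-01.
+
+```ppl ignore
+source=people
+| eval `TIMESTAMP(MAKEDATE(2020, 61), MAKETIME(12, 0, 0))` = TIMESTAMP(MAKEDATE(2020, 61), MAKETIME(12, 0, 0))
+| fields `TIMESTAMP(MAKEDATE(2020, 61), MAKETIME(12, 0, 0))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------------------------------------+
+| TIMESTAMP(MAKEDATE(2020, 61), MAKETIME(12, 0, 0)) |
+|---------------------------------------------------|
+| 2020-03-01 12:00:00                               |
++---------------------------------------------------+
+```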
+
+## MICROSECOND
+
+### Description
+
+Usage: microsecond(expr) returns the microseconds from the time or timestamp expression expr as a number in the range from 0 to 999999.
+Argument type: STRING/TIME/TIMESTAMP
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval `MICROSECOND(TIME('01:02:03.123456'))` = MICROSECOND(TIME('01:02:03.123456'))
+| fields `MICROSECOND(TIME('01:02:03.123456'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------------+
+| MICROSECOND(TIME('01:02:03.123456')) |
+|--------------------------------------|
+| 123456                               |
++--------------------------------------+
+```
+
+## MINUTE
+
+### Description
+
+Usage: minute(time) returns the minute for time, in the range 0 to 59.
+Argument type: STRING/TIME/TIMESTAMP
+Return type: INTEGER
+Synonyms: [MINUTE_OF_HOUR](#minute_of_hour)
+Example
+
+```ppl
+source=people
+| eval `MINUTE(TIME('01:02:03'))` = MINUTE(TIME('01:02:03'))
+| fields `MINUTE(TIME('01:02:03'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------+
+| MINUTE(TIME('01:02:03')) |
+|--------------------------|
+| 2                        |
++--------------------------+
+```
+
+## MINUTE_OF_DAY
+
+### Description
+
+Usage: minute_of_day(time) returns the minute of the day for time, in the range 0 to 1439.
+Argument type: STRING/TIME/TIMESTAMP
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval `MINUTE_OF_DAY(TIME('01:02:03'))` = MINUTE_OF_DAY(TIME('01:02:03'))
+| fields `MINUTE_OF_DAY(TIME('01:02:03'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------------------+
+| MINUTE_OF_DAY(TIME('01:02:03')) |
+|---------------------------------|
+| 62                              |
++---------------------------------+
+```
+
+## MINUTE_OF_HOUR
+
+### Description
+
+Usage: minute_of_hour(time) returns the minute for time, in the range 0 to 59.
+Argument type: STRING/TIME/TIMESTAMP
+Return type: INTEGER
+Synonyms: [MINUTE](#minute)
+Example
+
+```ppl
+source=people
+| eval `MINUTE_OF_HOUR(TIME('01:02:03'))` = MINUTE_OF_HOUR(TIME('01:02:03'))
+| fields `MINUTE_OF_HOUR(TIME('01:02:03'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------+
+| MINUTE_OF_HOUR(TIME('01:02:03')) |
+|----------------------------------|
+| 2                                |
++----------------------------------+
+```
+
+## MONTH
+
+### Description
+
+Usage: month(date) returns the month for date, in the range 1 to 12 for January to December.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: INTEGER
+Synonyms: [MONTH_OF_YEAR](#month_of_year)
+Example
+
+```ppl
+source=people
+| eval `MONTH(DATE('2020-08-26'))` = MONTH(DATE('2020-08-26'))
+| fields `MONTH(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------------+
+| MONTH(DATE('2020-08-26')) |
+|---------------------------|
+| 8                         |
++---------------------------+
+```
+
+## MONTH_OF_YEAR
+
+### Description
+
+Usage: month_of_year(date) returns the month for date, in the range 1 to 12 for January to December.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: INTEGER
+Synonyms: [MONTH](#month)
+Example
+
+```ppl
+source=people
+| eval `MONTH_OF_YEAR(DATE('2020-08-26'))` = MONTH_OF_YEAR(DATE('2020-08-26'))
+| fields `MONTH_OF_YEAR(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------+
+| MONTH_OF_YEAR(DATE('2020-08-26')) |
+|-----------------------------------|
+| 8                                 |
++-----------------------------------+
+```
+
+## MONTHNAME
+
+### Description
+
+Usage: monthname(date) returns the full name of the month for date.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: STRING
+Example
+
+```ppl
+source=people
+| eval `MONTHNAME(DATE('2020-08-26'))` = MONTHNAME(DATE('2020-08-26'))
+| fields `MONTHNAME(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------+
+| MONTHNAME(DATE('2020-08-26')) |
+|-------------------------------|
+| August                        |
++-------------------------------+
+```
+
+## NOW
+
+### Description
+
+Returns the current date and time as a value in 'YYYY-MM-DD hh:mm:ss' format. The value is expressed in the UTC time zone.
+`NOW()` returns a constant time that indicates the time at which the statement began to execute. This differs from the behavior of [SYSDATE()](#sysdate), which returns the exact time at which it executes.
+Return type: TIMESTAMP
+Specification: NOW() -> TIMESTAMP
+Example
+
+```ppl ignore
+source=people
+| eval `value_1` = NOW(), `value_2` = NOW()
+| fields `value_1`, `value_2`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+---------------------+
+| value_1             | value_2             |
+|---------------------+---------------------|
+| 2025-08-02 15:39:05 | 2025-08-02 15:39:05 |
++---------------------+---------------------+
+```
+
+## PERIOD_ADD
+
+### Description
+
+Usage: period_add(P, N) adds N months to period P (in the format YYMM or YYYYMM). Returns a value in the format YYYYMM.
+Argument type: INTEGER, INTEGER
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval `PERIOD_ADD(200801, 2)` = PERIOD_ADD(200801, 2), `PERIOD_ADD(200801, -12)` = PERIOD_ADD(200801, -12)
+| fields `PERIOD_ADD(200801, 2)`, `PERIOD_ADD(200801, -12)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------+-------------------------+
+| PERIOD_ADD(200801, 2) | PERIOD_ADD(200801, -12) |
+|-----------------------+-------------------------|
+| 200803                | 200701                  |
++-----------------------+-------------------------+
+```
+
+## PERIOD_DIFF
+
+### Description
+
+Usage: period_diff(P1, P2) returns the number of months between periods P1 and P2 given in the format YYMM or YYYYMM.
+Argument type: INTEGER, INTEGER
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval `PERIOD_DIFF(200802, 200703)` = PERIOD_DIFF(200802, 200703), `PERIOD_DIFF(200802, 201003)` = PERIOD_DIFF(200802, 201003)
+| fields `PERIOD_DIFF(200802, 200703)`, `PERIOD_DIFF(200802, 201003)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+-----------------------------+
+| PERIOD_DIFF(200802, 200703) | PERIOD_DIFF(200802, 201003) |
+|-----------------------------+-----------------------------|
+| 11                          | -25                         |
++-----------------------------+-----------------------------+
+```
+
+## QUARTER
+
+### Description
+
+Usage: quarter(date) returns the quarter of the year for date, in the range 1 to 4.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval `QUARTER(DATE('2020-08-26'))` = QUARTER(DATE('2020-08-26'))
+| fields `QUARTER(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+
+| QUARTER(DATE('2020-08-26')) |
+|-----------------------------|
+| 3                           |
++-----------------------------+
+```
+
+## SEC_TO_TIME
+
+### Description
+
+Usage: sec_to_time(number) returns the time in HH:mm:ss[.nnnnnn] format.
+Note that the function returns a time between 00:00:00 and 23:59:59.
+If an input value is too large (greater than 86399), the function will wrap around and begin returning outputs starting from 00:00:00. +If an input value is too small (less than 0), the function will wrap around and begin returning outputs counting down from 23:59:59. +Argument type: INTEGER, LONG, DOUBLE, FLOAT +Return type: TIME +Example + +```ppl +source=people +| eval `SEC_TO_TIME(3601)` = SEC_TO_TIME(3601) +| eval `SEC_TO_TIME(1234.123)` = SEC_TO_TIME(1234.123) +| fields `SEC_TO_TIME(3601)`, `SEC_TO_TIME(1234.123)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------+-----------------------+ +| SEC_TO_TIME(3601) | SEC_TO_TIME(1234.123) | +|-------------------+-----------------------| +| 01:00:01 | 00:20:34.123 | ++-------------------+-----------------------+ +``` + +## SECOND + +### Description + +Usage: second(time) returns the second for time, in the range 0 to 59. +Argument type: STRING/TIME/TIMESTAMP +Return type: INTEGER +Synonyms: [SECOND_OF_MINUTE](#second_of_minute) +Example + +```ppl +source=people +| eval `SECOND(TIME('01:02:03'))` = SECOND(TIME('01:02:03')) +| fields `SECOND(TIME('01:02:03'))` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------+ +| SECOND(TIME('01:02:03')) | +|--------------------------| +| 3 | ++--------------------------+ +``` + +## SECOND_OF_MINUTE + +### Description + +Usage: second_of_minute(time) returns the second for time, in the range 0 to 59. +Argument type: STRING/TIME/TIMESTAMP +Return type: INTEGER +Synonyms: [SECOND](#second) +Example + +```ppl +source=people +| eval `SECOND_OF_MINUTE(TIME('01:02:03'))` = SECOND_OF_MINUTE(TIME('01:02:03')) +| fields `SECOND_OF_MINUTE(TIME('01:02:03'))` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------------------------+ +| SECOND_OF_MINUTE(TIME('01:02:03')) | +|------------------------------------| +| 3 | ++------------------------------------+ +``` + +## STRFTIME + +**Version: 3.3.0** +### Description + +Usage: strftime(time, format) takes a UNIX timestamp (in seconds) and renders it as a string using the format specified. For numeric inputs, the UNIX time must be in seconds. Values greater than 100000000000 are automatically treated as milliseconds and converted to seconds. +You can use time format variables with the strftime function. This function performs the reverse operation of [UNIX_TIMESTAMP](#unix_timestamp) and is similar to [FROM_UNIXTIME](#from_unixtime) but with POSIX-style format specifiers. + - **Available only when Calcite engine is enabled** + - All timestamps are interpreted as UTC timezone + - Text formatting uses language-neutral Locale.ROOT (weekday and month names appear in abbreviated form) + - String inputs are NOT supported - use `unix_timestamp()` to convert strings first + - Functions that return date/time values (like `date()`, `now()`, `timestamp()`) are supported + +Argument type: INTEGER/LONG/DOUBLE/TIMESTAMP, STRING +Return type: STRING +Format specifiers: +The following table describes the available specifier arguments. 
+
+
+| Specifier | Description |
+| --- | --- |
+| %a | Abbreviated weekday name (Mon..Sun) |
+| %A | Weekday name (Mon..Sun) - Note: Locale.ROOT uses abbreviated form |
+| %b | Abbreviated month name (Jan..Dec) |
+| %B | Month name (Jan..Dec) - Note: Locale.ROOT uses abbreviated form |
+| %c | Date and time (e.g., Mon Jul 18 09:30:00 2019) |
+| %C | Century as 2-digit decimal number |
+| %d | Day of the month, zero-padded (01..31) |
+| %e | Day of the month, space-padded ( 1..31) |
+| %Ez | Timezone offset in minutes from UTC (e.g., +0 for UTC, +330 for IST, -300 for EST) |
+| %f | Microseconds as decimal number (000000..999999) |
+| %F | ISO 8601 date format (%Y-%m-%d) |
+| %g | ISO 8601 year without century (00..99) |
+| %G | ISO 8601 year with century |
+| %H | Hour (24-hour clock) (00..23) |
+| %I | Hour (12-hour clock) (01..12) |
+| %j | Day of year (001..366) |
+| %k | Hour (24-hour clock), space-padded ( 0..23) |
+| %m | Month as decimal number (01..12) |
+| %M | Minute (00..59) |
+| %N | Subsecond digits (default %9N = nanoseconds). Accepts any precision value from 1-9 (e.g., %3N = 3 digits, %5N = 5 digits, %9N = 9 digits). The precision directly controls the number of digits displayed |
+| %p | AM or PM |
+| %Q | Subsecond component (default milliseconds). Can specify precision: %3Q = milliseconds, %6Q = microseconds, %9Q = nanoseconds. Other precision values (e.g., %5Q) default to %3Q |
+| %s | UNIX Epoch timestamp in seconds |
+| %S | Second (00..59) |
+| %T | Time in 24-hour notation (%H:%M:%S) |
+| %U | Week of year starting from 0 (00..53) |
+| %V | ISO week number (01..53) |
+| %w | Weekday as decimal (0=Sunday..6=Saturday) |
+| %x | Date in MM/dd/yyyy format (e.g., 07/13/2019) |
+| %X | Time in HH:mm:ss format (e.g., 09:30:00) |
+| %y | Year without century (00..99) |
+| %Y | Year with century |
+| %z | Timezone offset (+hhmm or -hhmm) |
+| %:z | Timezone offset with colon (+hh:mm or -hh:mm) |
+| %::z | Timezone offset with colons (+hh:mm:ss) |
+| %:::z | Timezone offset hour only (+hh or -hh) |
+| %Z | Timezone abbreviation (e.g., EST, PDT) |
+| %% | Literal % character |
+
+
+Examples
+
+```ppl ignore
+source=people | eval `strftime(1521467703, "%Y-%m-%dT%H:%M:%S")` = strftime(1521467703, "%Y-%m-%dT%H:%M:%S") | fields `strftime(1521467703, "%Y-%m-%dT%H:%M:%S")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------------------+
+| strftime(1521467703, "%Y-%m-%dT%H:%M:%S") |
+|-------------------------------------------|
+| 2018-03-19T13:55:03                       |
++-------------------------------------------+
+```
+
+```ppl ignore
+source=people | eval `strftime(1521467703, "%F %T")` = strftime(1521467703, "%F %T") | fields `strftime(1521467703, "%F %T")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------+
+| strftime(1521467703, "%F %T") |
+|-------------------------------|
+| 2018-03-19 13:55:03           |
++-------------------------------+
+```
+
+```ppl ignore
+source=people | eval `strftime(1521467703, "%a %b %d, %Y")` = strftime(1521467703, "%a %b %d, %Y") | fields `strftime(1521467703, "%a %b %d, %Y")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------------+
+| strftime(1521467703, "%a %b %d, %Y") |
+|--------------------------------------|
+| Mon Mar 19, 2018                     |
++--------------------------------------+
+```
+
+```ppl ignore
+source=people | eval `strftime(1521467703, "%%Y")` = strftime(1521467703, "%%Y") | fields `strftime(1521467703, "%%Y")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+
+| strftime(1521467703, "%%Y") |
+|-----------------------------|
+| %Y                          |
++-----------------------------+
+```
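+
+As noted above, numeric values greater than 100000000000 are treated as milliseconds, so a millisecond timestamp should render the same as its equivalent in seconds. The following sketch is illustrative (marked `ignore`, so it is not run by doctest):
+
+```ppl ignore
+source=people | eval `strftime(1521467703000, "%F %T")` = strftime(1521467703000, "%F %T") | fields `strftime(1521467703000, "%F %T")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------+
+| strftime(1521467703000, "%F %T") |
+|----------------------------------|
+| 2018-03-19 13:55:03              |
++----------------------------------+
+```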
+
+```ppl ignore
+source=people | eval `strftime(date('2020-09-16'), "%Y-%m-%d")` = strftime(date('2020-09-16'), "%Y-%m-%d") | fields `strftime(date('2020-09-16'), "%Y-%m-%d")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------------------------------+
+| strftime(date('2020-09-16'), "%Y-%m-%d") |
+|------------------------------------------|
+| 2020-09-16                               |
++------------------------------------------+
+```
+
+```ppl ignore
+source=people | eval `strftime(timestamp('2020-09-16 14:30:00'), "%F %T")` = strftime(timestamp('2020-09-16 14:30:00'), "%F %T") | fields `strftime(timestamp('2020-09-16 14:30:00'), "%F %T")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------+
+| strftime(timestamp('2020-09-16 14:30:00'), "%F %T") |
+|-----------------------------------------------------|
+| 2020-09-16 14:30:00                                 |
++-----------------------------------------------------+
+```
+
+```ppl ignore
+source=people | eval `strftime(now(), "%Y-%m-%d %H:%M:%S")` = strftime(now(), "%Y-%m-%d %H:%M:%S") | fields `strftime(now(), "%Y-%m-%d %H:%M:%S")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------------+
+| strftime(now(), "%Y-%m-%d %H:%M:%S") |
+|--------------------------------------|
+| 2025-09-03 12:30:45                  |
++--------------------------------------+
+```
+
+## STR_TO_DATE
+
+### Description
+
+Usage: str_to_date(string, string) extracts a TIMESTAMP from the first argument string using the format specified in the second argument string.
+The input argument must have enough information to be parsed as a DATE, TIMESTAMP, or TIME.
+Acceptable string format specifiers are the same as those used in the [DATE_FORMAT](#date_format) function.
+It returns NULL when the string cannot be parsed due to an invalid pair of arguments, or when 0 is provided for any DATE field. Otherwise, it returns a TIMESTAMP with the parsed values (as well as default values for any field that was not parsed).
+Argument type: STRING, STRING
+Return type: TIMESTAMP
+Example
+
+```ppl
+source=people
+| eval `str_to_date("01,5,2013", "%d,%m,%Y")` = str_to_date("01,5,2013", "%d,%m,%Y")
+| fields `str_to_date("01,5,2013", "%d,%m,%Y")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------------+
+| str_to_date("01,5,2013", "%d,%m,%Y") |
+|--------------------------------------|
+| 2013-05-01 00:00:00                  |
++--------------------------------------+
+```
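+
+When the string does not match the given format, the result is NULL. The following sketch is illustrative (marked `ignore`, so it is not run by doctest):
+
+```ppl ignore
+source=people
+| eval `str_to_date("01,5,2013", "%Y-%m-%d")` = str_to_date("01,5,2013", "%Y-%m-%d")
+| fields `str_to_date("01,5,2013", "%Y-%m-%d")`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------------+
+| str_to_date("01,5,2013", "%Y-%m-%d") |
+|--------------------------------------|
+| null                                 |
++--------------------------------------+
+```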
+
+## SUBDATE
+
+### Description
+
+Usage: subdate(date, INTERVAL expr unit) subtracts the interval expr from date; subdate(date, days) subtracts the second argument as an integer number of days from date.
+If the first argument is TIME, today's date is used; if the first argument is DATE, time at midnight is used.
+Argument type: DATE/TIMESTAMP/TIME, INTERVAL/LONG
+Return type map:
+(DATE/TIMESTAMP/TIME, INTERVAL) -> TIMESTAMP
+(DATE, LONG) -> DATE
+(TIMESTAMP/TIME, LONG) -> TIMESTAMP
+Synonyms: [DATE_SUB](#date_sub) when invoked with the INTERVAL form of the second argument.
+Antonyms: [ADDDATE](#adddate)
+Example
+
+```ppl
+source=people
+| eval `'2008-01-02' - 31d` = SUBDATE(DATE('2008-01-02'), INTERVAL 31 DAY), `'2020-08-26' - 1` = SUBDATE(DATE('2020-08-26'), 1), `ts '2020-08-26 01:01:01' - 1` = SUBDATE(TIMESTAMP('2020-08-26 01:01:01'), 1)
+| fields `'2008-01-02' - 31d`, `'2020-08-26' - 1`, `ts '2020-08-26 01:01:01' - 1`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+------------------+------------------------------+
+| '2008-01-02' - 31d  | '2020-08-26' - 1 | ts '2020-08-26 01:01:01' - 1 |
+|---------------------+------------------+------------------------------|
+| 2007-12-02 00:00:00 | 2020-08-25       | 2020-08-25 01:01:01          |
++---------------------+------------------+------------------------------+
+```
+
+## SUBTIME
+
+### Description
+
+Usage: subtime(expr1, expr2) subtracts expr2 from expr1 and returns the result. If the argument is TIME, today's date is used; if the argument is DATE, time at midnight is used.
+Argument type: DATE/TIMESTAMP/TIME, DATE/TIMESTAMP/TIME
+Return type map:
+(DATE/TIMESTAMP, DATE/TIMESTAMP/TIME) -> TIMESTAMP
+(TIME, DATE/TIMESTAMP/TIME) -> TIME
+Antonyms: [ADDTIME](#addtime)
+Example
+
+```ppl
+source=people
+| eval `'2008-12-12' - 0` = SUBTIME(DATE('2008-12-12'), DATE('2008-11-15'))
+| fields `'2008-12-12' - 0`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+
+| '2008-12-12' - 0    |
+|---------------------|
+| 2008-12-12 00:00:00 |
++---------------------+
+```
+
+```ppl
+source=people
+| eval `'23:59:59' - 0` = SUBTIME(TIME('23:59:59'), DATE('2004-01-01'))
+| fields `'23:59:59' - 0`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------+
+| '23:59:59' - 0 |
+|----------------|
+| 23:59:59       |
++----------------+
+```
+
+```ppl
+source=people
+| eval `'2004-01-01' - '23:59:59'` = SUBTIME(DATE('2004-01-01'), TIME('23:59:59'))
+| fields `'2004-01-01' - '23:59:59'`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------------+
+| '2004-01-01' - '23:59:59' |
+|---------------------------|
+| 2003-12-31 00:00:01       |
++---------------------------+
+```
+
+```ppl
+source=people
+| eval `'10:20:30' - '00:05:42'` = SUBTIME(TIME('10:20:30'), TIME('00:05:42'))
+| fields `'10:20:30' - '00:05:42'`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------+
+| '10:20:30' - '00:05:42' |
+|-------------------------|
+| 10:14:48                |
++-------------------------+
+```
+
+```ppl
+source=people
+| eval `'2007-03-01 10:20:30' - '20:40:50'` = SUBTIME(TIMESTAMP('2007-03-01 10:20:30'), TIMESTAMP('2002-03-04 20:40:50'))
+| fields `'2007-03-01 10:20:30' - '20:40:50'`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------------------------+
+| '2007-03-01 10:20:30' - '20:40:50' |
+|------------------------------------|
+| 2007-02-28 13:39:40                |
++------------------------------------+
+```
+
+## SYSDATE
+
+### Description
+
+Returns the current date and time as a value in 'YYYY-MM-DD hh:mm:ss[.nnnnnn]' format.
+SYSDATE() returns the date and time at which it executes, in UTC. This differs from the behavior of [NOW()](#now), which returns a constant time that indicates the time at which the statement began to execute.
+If an argument is given, it specifies a fractional seconds precision from 0 to 6; the return value then includes a fractional seconds part of that many digits.
+Optional argument type: INTEGER
+Return type: TIMESTAMP
+Specification: SYSDATE([INTEGER]) -> TIMESTAMP
+Example
+
+```ppl ignore
+source=people
+| eval `value_1` = SYSDATE(), `value_2` = SYSDATE(6)
+| fields `value_1`, `value_2`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+----------------------------+
+| value_1             | value_2                    |
+|---------------------+----------------------------|
+| 2025-08-02 15:39:05 | 2025-08-02 15:39:05.123456 |
++---------------------+----------------------------+
+```
+
+## TIME
+
+### Description
+
+Usage: time(expr) constructs a time type with the input string expr as a time. If the argument is of date/time/timestamp type, it extracts the time value part from the expression.
+Argument type: STRING/DATE/TIME/TIMESTAMP
+Return type: TIME
+Example
+
+```ppl
+source=people
+| eval `TIME('13:49:00')` = TIME('13:49:00')
+| fields `TIME('13:49:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------+
+| TIME('13:49:00') |
+|------------------|
+| 13:49:00         |
++------------------+
+```
+
+```ppl
+source=people
+| eval `TIME('13:49')` = TIME('13:49')
+| fields `TIME('13:49')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------+
+| TIME('13:49') |
+|---------------|
+| 13:49:00      |
++---------------+
+```
+
+```ppl
+source=people
+| eval `TIME('2020-08-26 13:49:00')` = TIME('2020-08-26 13:49:00')
+| fields `TIME('2020-08-26 13:49:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+
+| TIME('2020-08-26 13:49:00') |
+|-----------------------------|
+| 13:49:00                    |
++-----------------------------+
+```
+
+```ppl
+source=people
+| eval `TIME('2020-08-26 13:49')` = TIME('2020-08-26 13:49')
+| fields `TIME('2020-08-26 13:49')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------+
+| TIME('2020-08-26 13:49') |
+|--------------------------|
+| 13:49:00                 |
++--------------------------+
+```
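+
+The argument may also be a date, time, or timestamp value rather than a string, in which case the time part is extracted. The following sketch is illustrative (marked `ignore`, so it is not run by doctest):
+
+```ppl ignore
+source=people
+| eval `TIME(TIMESTAMP('2020-08-26 13:49:00'))` = TIME(TIMESTAMP('2020-08-26 13:49:00'))
+| fields `TIME(TIMESTAMP('2020-08-26 13:49:00'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------------+
+| TIME(TIMESTAMP('2020-08-26 13:49:00')) |
+|----------------------------------------|
+| 13:49:00                               |
++----------------------------------------+
+```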
+
+## TIME_FORMAT
+
+### Description
+
+Usage: time_format(time, format) formats the time argument using the specifiers in the format argument.
+This supports a subset of the time format specifiers available for the [date_format](#date_format) function.
+Using date format specifiers supported by [date_format](#date_format) will return 0 or null.
+Acceptable format specifiers are listed in the table below.
+If an argument of type DATE is passed in, it is treated as a TIMESTAMP at midnight (i.e., 00:00:00).
+The following table describes the available specifier arguments.
+
+
+| Specifier | Description |
+| --- | --- |
+| %f | Microseconds (000000..999999) |
+| %H | Hour (00..23) |
+| %h | Hour (01..12) |
+| %I | Hour (01..12) |
+| %i | Minutes, numeric (00..59) |
+| %p | AM or PM |
+| %r | Time, 12-hour (hh:mm:ss followed by AM or PM) |
+| %S | Seconds (00..59) |
+| %s | Seconds (00..59) |
+| %T | Time, 24-hour (hh:mm:ss) |
+
+
+Argument type: STRING/DATE/TIME/TIMESTAMP, STRING
+Return type: STRING
+Example
+
+```ppl
+source=people
+| eval `TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T')` = TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T')
+| fields `TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------------------------------------------------------+
+| TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T')  |
+|-----------------------------------------------------------------------------|
+| 012345 13 01 01 14 PM 01:14:15 PM 15 15 13:14:15                            |
++-----------------------------------------------------------------------------+
+```
+
+## TIME_TO_SEC
+
+### Description
+
+Usage: time_to_sec(time) returns the time argument, converted to seconds.
+Argument type: STRING/TIME/TIMESTAMP
+Return type: LONG
+Example
+
+```ppl
+source=people
+| eval `TIME_TO_SEC(TIME('22:23:00'))` = TIME_TO_SEC(TIME('22:23:00'))
+| fields `TIME_TO_SEC(TIME('22:23:00'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------+
+| TIME_TO_SEC(TIME('22:23:00')) |
+|-------------------------------|
+| 80580                         |
++-------------------------------+
+```
+
+## TIMEDIFF
+
+### Description
+
+Usage: returns the difference between two time expressions as a time.
+Argument type: TIME, TIME
+Return type: TIME
+Example
+
+```ppl
+source=people
+| eval `TIMEDIFF('23:59:59', '13:00:00')` = TIMEDIFF('23:59:59', '13:00:00')
+| fields `TIMEDIFF('23:59:59', '13:00:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------+
+| TIMEDIFF('23:59:59', '13:00:00') |
+|----------------------------------|
+| 10:59:59                         |
++----------------------------------+
+```
+
+## TIMESTAMP
+
+### Description
+
+Usage: timestamp(expr) constructs a timestamp type with the input string `expr` as a timestamp. If the argument is not a string, it casts `expr` to timestamp type with the default timezone UTC. If the argument is a time, today's date is applied before the cast.
+With two arguments, `timestamp(expr1, expr2)` adds the time expression `expr2` to the date or timestamp expression `expr1` and returns the result as a timestamp value.
+Argument type: STRING/DATE/TIME/TIMESTAMP
+Return type map:
+(STRING/DATE/TIME/TIMESTAMP) -> TIMESTAMP
+(STRING/DATE/TIME/TIMESTAMP, STRING/DATE/TIME/TIMESTAMP) -> TIMESTAMP
+Example
+
+```ppl
+source=people
+| eval `TIMESTAMP('2020-08-26 13:49:00')` = TIMESTAMP('2020-08-26 13:49:00'), `TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42'))` = TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42'))
+| fields `TIMESTAMP('2020-08-26 13:49:00')`, `TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------+----------------------------------------------------+
+| TIMESTAMP('2020-08-26 13:49:00') | TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42')) |
+|----------------------------------+----------------------------------------------------|
+| 2020-08-26 13:49:00              | 2020-08-27 02:04:42                                |
++----------------------------------+----------------------------------------------------+
+```
+
+## TIMESTAMPADD
+
+### Description
+
+Usage: Returns a TIMESTAMP value based on a passed-in DATE/TIME/TIMESTAMP/STRING argument and INTERVAL and INTEGER arguments which determine the amount of time to be added.
+If the third argument is a STRING, it must be formatted as a valid TIMESTAMP. If only a TIME is provided, a TIMESTAMP is still returned with the DATE portion filled in using the current date.
+If the third argument is a DATE, it will be automatically converted to a TIMESTAMP.
+Argument type: INTERVAL, INTEGER, DATE/TIME/TIMESTAMP/STRING
+INTERVAL must be one of the following tokens: [MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, YEAR]
+Examples
+
+```ppl
+source=people
+| eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')
+| eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')
+| fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------------------+--------------------------------------------------+
+| TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') |
+|----------------------------------------------+--------------------------------------------------|
+| 2000-01-18 00:00:00                          | 1999-10-01 00:00:00                              |
++----------------------------------------------+--------------------------------------------------+
+```
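+
+The other interval units from the list above work the same way; for instance, adding one week. The following sketch is illustrative (marked `ignore`, so it is not run by doctest):
+
+```ppl ignore
+source=people
+| eval `TIMESTAMPADD(WEEK, 1, '2000-01-01 00:00:00')` = TIMESTAMPADD(WEEK, 1, '2000-01-01 00:00:00')
+| fields `TIMESTAMPADD(WEEK, 1, '2000-01-01 00:00:00')`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------------------+
+| TIMESTAMPADD(WEEK, 1, '2000-01-01 00:00:00') |
+|----------------------------------------------|
+| 2000-01-08 00:00:00                          |
++----------------------------------------------+
+```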
+
+## TIMESTAMPDIFF
+
+### Description
+
+Usage: TIMESTAMPDIFF(interval, start, end) returns the difference between the start and end date/times in interval units.
+If a TIME is provided as an argument, it will be converted to a TIMESTAMP with the DATE portion filled in using the current date.
+Arguments will be automatically converted to a TIME/TIMESTAMP when appropriate.
+Any argument that is a STRING must be formatted as a valid TIMESTAMP.
+Argument type: INTERVAL, DATE/TIME/TIMESTAMP/STRING, DATE/TIME/TIMESTAMP/STRING
+INTERVAL must be one of the following tokens: [MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, YEAR]
+Examples
+
+```ppl
+source=people
+| eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')
+| eval `TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00'))` = TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00'))
+| fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------------------------------------------------+-----------------------------------------------------------+
+| TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00')) |
+|-------------------------------------------------------------------+-----------------------------------------------------------|
+| 4                                                                 | -23                                                       |
++-------------------------------------------------------------------+-----------------------------------------------------------+
+```
+
+## TO_DAYS
+
+### Description
+
+Usage: to_days(date) returns the day number (the number of days since year 0) of the given date. Returns NULL if date is invalid.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: LONG
+Example
+
+```ppl
+source=people
+| eval `TO_DAYS(DATE('2008-10-07'))` = TO_DAYS(DATE('2008-10-07'))
+| fields `TO_DAYS(DATE('2008-10-07'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+
+| TO_DAYS(DATE('2008-10-07')) |
+|-----------------------------|
+| 733687                      |
++-----------------------------+
+```
+
+## TO_SECONDS
+
+### Description
+
+Usage: to_seconds(date) returns the number of seconds since the year 0 for the given value. Returns NULL if the value is invalid.
+An argument of a LONG type can be used. It must be formatted as YMMDD, YYMMDD, YYYMMDD or YYYYMMDD. Note that a LONG type argument cannot have leading 0s as it will be parsed using an octal numbering system.
+Argument type: STRING/LONG/DATE/TIME/TIMESTAMP
+Return type: LONG
+Example
+
+```ppl
+source=people
+| eval `TO_SECONDS(DATE('2008-10-07'))` = TO_SECONDS(DATE('2008-10-07'))
+| eval `TO_SECONDS(950228)` = TO_SECONDS(950228)
+| fields `TO_SECONDS(DATE('2008-10-07'))`, `TO_SECONDS(950228)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------+--------------------+
+| TO_SECONDS(DATE('2008-10-07')) | TO_SECONDS(950228) |
+|--------------------------------+--------------------|
+| 63390556800                    | 62961148800        |
++--------------------------------+--------------------+
+```
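+
+For DATE values, the result of TO_SECONDS should relate to [TO_DAYS](#to_days) by a factor of 86400 seconds per day; the documented values above (733687 days and 63390556800 seconds) satisfy this. The following sketch is illustrative (marked `ignore`, so it is not run by doctest):
+
+```ppl ignore
+source=people
+| eval `TO_SECONDS(DATE('2008-10-07'))` = TO_SECONDS(DATE('2008-10-07')), `TO_DAYS(DATE('2008-10-07')) * 86400` = TO_DAYS(DATE('2008-10-07')) * 86400
+| fields `TO_SECONDS(DATE('2008-10-07'))`, `TO_DAYS(DATE('2008-10-07')) * 86400`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------------+-------------------------------------+
+| TO_SECONDS(DATE('2008-10-07')) | TO_DAYS(DATE('2008-10-07')) * 86400 |
+|--------------------------------+-------------------------------------|
+| 63390556800                    | 63390556800                         |
++--------------------------------+-------------------------------------+
+```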
+
+## UNIX_TIMESTAMP
+
+### Description
+
+Usage: Converts the given argument to Unix time (seconds since the Epoch, the very beginning of the year 1970). If no argument is given, it returns the current Unix time.
+The date argument may be a DATE or TIMESTAMP string, or a number in YYMMDD, YYMMDDhhmmss, YYYYMMDD, or YYYYMMDDhhmmss format. If the argument includes a time part, it may optionally include a fractional seconds part.
+If the argument is in an invalid format or outside of the range 1970-01-01 00:00:00 - 3001-01-18 23:59:59.999999 (0 to 32536771199.999999 epoch time), the function returns NULL.
+You can use [FROM_UNIXTIME](#from_unixtime) to do the reverse conversion.
+Argument type: DOUBLE/DATE/TIMESTAMP (the argument is optional)
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `UNIX_TIMESTAMP(double)` = UNIX_TIMESTAMP(20771122143845), `UNIX_TIMESTAMP(timestamp)` = UNIX_TIMESTAMP(TIMESTAMP('1996-11-15 17:05:42'))
+| fields `UNIX_TIMESTAMP(double)`, `UNIX_TIMESTAMP(timestamp)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------------+---------------------------+
+| UNIX_TIMESTAMP(double) | UNIX_TIMESTAMP(timestamp) |
+|------------------------+---------------------------|
+| 3404817525.0           | 848077542.0               |
++------------------------+---------------------------+
+```
+
+## UTC_DATE
+
+### Description
+
+Returns the current UTC date as a value in 'YYYY-MM-DD' format.
+Return type: DATE
+Specification: UTC_DATE() -> DATE
+Example
+
+```ppl ignore
+source=people
+| eval `UTC_DATE()` = UTC_DATE()
+| fields `UTC_DATE()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------+
+| UTC_DATE() |
+|------------|
+| 2025-10-03 |
++------------+
+```
+
+## UTC_TIME
+
+### Description
+
+Returns the current UTC time as a value in 'hh:mm:ss' format.
+Return type: TIME
+Specification: UTC_TIME() -> TIME
+Example
+
+```ppl ignore
+source=people
+| eval `UTC_TIME()` = UTC_TIME()
+| fields `UTC_TIME()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------+
+| UTC_TIME() |
+|------------|
+| 17:54:27   |
++------------+
+```
+
+## UTC_TIMESTAMP
+
+### Description
+
+Returns the current UTC timestamp as a value in 'YYYY-MM-DD hh:mm:ss' format.
+Return type: TIMESTAMP
+Specification: UTC_TIMESTAMP() -> TIMESTAMP
+Example
+
+```ppl ignore
+source=people
+| eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP()
+| fields `UTC_TIMESTAMP()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+
+| UTC_TIMESTAMP()     |
+|---------------------|
+| 2025-10-03 17:54:28 |
++---------------------+
+```
+
+## WEEK
+
+### Description
+
+Usage: week(date[, mode]) returns the week number for date. If the mode argument is omitted, the default mode 0 is used.
+The following table describes how the mode argument works.
+
+
+| Mode | First day of week | Range | Week 1 is the first week ... |
+| --- | --- | --- | --- |
+| 0 | Sunday | 0-53 | with a Sunday in this year |
+| 1 | Monday | 0-53 | with 4 or more days this year |
+| 2 | Sunday | 1-53 | with a Sunday in this year |
+| 3 | Monday | 1-53 | with 4 or more days this year |
+| 4 | Sunday | 0-53 | with 4 or more days this year |
+| 5 | Monday | 0-53 | with a Monday in this year |
+| 6 | Sunday | 1-53 | with 4 or more days this year |
+| 7 | Monday | 1-53 | with a Monday in this year |
+
+
+Argument type: DATE/TIMESTAMP/STRING
+Return type: INTEGER
+Synonyms: [WEEK_OF_YEAR](#week_of_year)
+Example
+
+```ppl
+source=people
+| eval `WEEK(DATE('2008-02-20'))` = WEEK(DATE('2008-02-20')), `WEEK(DATE('2008-02-20'), 1)` = WEEK(DATE('2008-02-20'), 1)
+| fields `WEEK(DATE('2008-02-20'))`, `WEEK(DATE('2008-02-20'), 1)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------+-----------------------------+
+| WEEK(DATE('2008-02-20')) | WEEK(DATE('2008-02-20'), 1) |
+|--------------------------+-----------------------------|
+| 7                        | 8                           |
++--------------------------+-----------------------------+
+```
+
+## WEEKDAY
+
+### Description
+
+Usage: weekday(date) returns the weekday index for date (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).
+It is similar to the [dayofweek](#dayofweek) function, but returns a different index for each day.
+Argument type: STRING/DATE/TIME/TIMESTAMP
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval `weekday(DATE('2020-08-26'))` = weekday(DATE('2020-08-26'))
+| eval `weekday(DATE('2020-08-27'))` = weekday(DATE('2020-08-27'))
+| fields `weekday(DATE('2020-08-26'))`, `weekday(DATE('2020-08-27'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------------------------+-----------------------------+
+| weekday(DATE('2020-08-26')) | weekday(DATE('2020-08-27')) |
+|-----------------------------+-----------------------------|
+| 2                           | 3                           |
++-----------------------------+-----------------------------+
+```
+
+## WEEK_OF_YEAR
+
+### Description
+
+Usage: week_of_year(date[, mode]) returns the week number for date. If the mode argument is omitted, the default mode 0 is used.
+The following table describes how the mode argument works.
+
+
+| Mode | First day of week | Range | Week 1 is the first week ... |
+| --- | --- | --- | --- |
+| 0 | Sunday | 0-53 | with a Sunday in this year |
+| 1 | Monday | 0-53 | with 4 or more days this year |
+| 2 | Sunday | 1-53 | with a Sunday in this year |
+| 3 | Monday | 1-53 | with 4 or more days this year |
+| 4 | Sunday | 0-53 | with 4 or more days this year |
+| 5 | Monday | 0-53 | with a Monday in this year |
+| 6 | Sunday | 1-53 | with 4 or more days this year |
+| 7 | Monday | 1-53 | with a Monday in this year |
+
+
+Argument type: DATE/TIMESTAMP/STRING
+Return type: INTEGER
+Synonyms: [WEEK](#week)
+Example
+
+```ppl
+source=people
+| eval `WEEK_OF_YEAR(DATE('2008-02-20'))` = WEEK_OF_YEAR(DATE('2008-02-20')), `WEEK_OF_YEAR(DATE('2008-02-20'), 1)` = WEEK_OF_YEAR(DATE('2008-02-20'), 1)
+| fields `WEEK_OF_YEAR(DATE('2008-02-20'))`, `WEEK_OF_YEAR(DATE('2008-02-20'), 1)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------------+-------------------------------------+
+| WEEK_OF_YEAR(DATE('2008-02-20')) | WEEK_OF_YEAR(DATE('2008-02-20'), 1) |
+|----------------------------------+-------------------------------------|
+| 7                                | 8                                   |
++----------------------------------+-------------------------------------+
+```
+
+## YEAR
+
+### Description
+
+Usage: year(date) returns the year for date, in the range 1000 to 9999, or 0 for the "zero" date.
+Argument type: STRING/DATE/TIMESTAMP
+Return type: INTEGER
+Example
+
+```ppl
+source=people
+| eval `YEAR(DATE('2020-08-26'))` = YEAR(DATE('2020-08-26'))
+| fields `YEAR(DATE('2020-08-26'))`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------------+
+| YEAR(DATE('2020-08-26')) |
+|--------------------------|
+| 2020                     |
++--------------------------+
+```
+
+## YEARWEEK
+
+### Description
+
+Usage: yearweek(date[, mode]) returns the year and week for date as an integer. It accepts an optional mode argument aligned with those available for the [WEEK](#week) function.
+Argument type: STRING/DATE/TIME/TIMESTAMP +Return type: INTEGER +Example + +```ppl + +source=people +| eval `YEARWEEK('2020-08-26')` = YEARWEEK('2020-08-26') +| eval `YEARWEEK('2019-01-05', 1)` = YEARWEEK('2019-01-05', 1) +| fields `YEARWEEK('2020-08-26')`, `YEARWEEK('2019-01-05', 1)` + +``` + +Expected output: + +```text + +fetched rows / total rows = 1/1 ++------------------------+---------------------------+ +| YEARWEEK('2020-08-26') | YEARWEEK('2019-01-05', 1) | +|------------------------+---------------------------| +| 202034 | 201901 | + ++------------------------+---------------------------+ + +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/datetime.rst b/docs/user/ppl/functions/datetime.rst deleted file mode 100644 index bd69425a2dd..00000000000 --- a/docs/user/ppl/functions/datetime.rst +++ /dev/null @@ -1,2360 +0,0 @@ -======================= -Date and Time Functions -======================= - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -.. note:: - - All PPL date and time functions use the UTC time zone. Both input and output values are interpreted as UTC. - For instance, an input timestamp literal like '2020-08-26 01:01:01' is assumed to be in UTC, and the now() - function also returns the current date and time in UTC. - -ADDDATE -------- - -Description ->>>>>>>>>>> - -Usage: adddate(date, INTERVAL expr unit) / adddate(date, days) adds the interval of second argument to date; adddate(date, days) adds the second argument as integer number of days to date. -If first argument is TIME, today's date is used; if first argument is DATE, time at midnight is used. - -Argument type: DATE/TIMESTAMP/TIME, INTERVAL/LONG - -Return type map: - -(DATE/TIMESTAMP/TIME, INTERVAL) -> TIMESTAMP - -(DATE, LONG) -> DATE - -(TIMESTAMP/TIME, LONG) -> TIMESTAMP - -Synonyms: `DATE_ADD`_ when invoked with the INTERVAL form of the second argument. - -Antonyms: `SUBDATE`_ - -Example:: - - os> source=people | eval `'2020-08-26' + 1h` = ADDDATE(DATE('2020-08-26'), INTERVAL 1 HOUR), `'2020-08-26' + 1` = ADDDATE(DATE('2020-08-26'), 1), `ts '2020-08-26 01:01:01' + 1` = ADDDATE(TIMESTAMP('2020-08-26 01:01:01'), 1) | fields `'2020-08-26' + 1h`, `'2020-08-26' + 1`, `ts '2020-08-26 01:01:01' + 1` - fetched rows / total rows = 1/1 - +---------------------+------------------+------------------------------+ - | '2020-08-26' + 1h | '2020-08-26' + 1 | ts '2020-08-26 01:01:01' + 1 | - |---------------------+------------------+------------------------------| - | 2020-08-26 01:00:00 | 2020-08-27 | 2020-08-27 01:01:01 | - +---------------------+------------------+------------------------------+ - - - -ADDTIME -------- - -Description ->>>>>>>>>>> - -Usage: addtime(expr1, expr2) adds expr2 to expr1 and returns the result. If argument is TIME, today's date is used; if argument is DATE, time at midnight is used. 
- -Argument type: DATE/TIMESTAMP/TIME, DATE/TIMESTAMP/TIME - -Return type map: - -(DATE/TIMESTAMP, DATE/TIMESTAMP/TIME) -> TIMESTAMP - -(TIME, DATE/TIMESTAMP/TIME) -> TIME - -Antonyms: `SUBTIME`_ - -Example:: - - os> source=people | eval `'2008-12-12' + 0` = ADDTIME(DATE('2008-12-12'), DATE('2008-11-15')) | fields `'2008-12-12' + 0` - fetched rows / total rows = 1/1 - +---------------------+ - | '2008-12-12' + 0 | - |---------------------| - | 2008-12-12 00:00:00 | - +---------------------+ - - os> source=people | eval `'23:59:59' + 0` = ADDTIME(TIME('23:59:59'), DATE('2004-01-01')) | fields `'23:59:59' + 0` - fetched rows / total rows = 1/1 - +----------------+ - | '23:59:59' + 0 | - |----------------| - | 23:59:59 | - +----------------+ - - os> source=people | eval `'2004-01-01' + '23:59:59'` = ADDTIME(DATE('2004-01-01'), TIME('23:59:59')) | fields `'2004-01-01' + '23:59:59'` - fetched rows / total rows = 1/1 - +---------------------------+ - | '2004-01-01' + '23:59:59' | - |---------------------------| - | 2004-01-01 23:59:59 | - +---------------------------+ - - os> source=people | eval `'10:20:30' + '00:05:42'` = ADDTIME(TIME('10:20:30'), TIME('00:05:42')) | fields `'10:20:30' + '00:05:42'` - fetched rows / total rows = 1/1 - +-------------------------+ - | '10:20:30' + '00:05:42' | - |-------------------------| - | 10:26:12 | - +-------------------------+ - - os> source=people | eval `'2007-02-28 10:20:30' + '20:40:50'` = ADDTIME(TIMESTAMP('2007-02-28 10:20:30'), TIMESTAMP('2002-03-04 20:40:50')) | fields `'2007-02-28 10:20:30' + '20:40:50'` - fetched rows / total rows = 1/1 - +------------------------------------+ - | '2007-02-28 10:20:30' + '20:40:50' | - |------------------------------------| - | 2007-03-01 07:01:20 | - +------------------------------------+ - - -CONVERT_TZ ----------- - -Description ->>>>>>>>>>> - -Usage: convert_tz(timestamp, from_timezone, to_timezone) constructs a local timestamp converted from the from_timezone to the to_timezone. CONVERT_TZ returns null when any of the three function arguments are invalid, i.e. timestamp is not in the format yyyy-MM-dd HH:mm:ss or the timeszone is not in (+/-)HH:mm. It also is invalid for invalid dates, such as February 30th and invalid timezones, which are ones outside of -13:59 and +14:00. - -Argument type: TIMESTAMP/STRING, STRING, STRING - -Return type: TIMESTAMP - -Conversion from +00:00 timezone to +10:00 timezone. Returns the timestamp argument converted from +00:00 to +10:00 -Example:: - - os> source=people | eval `convert_tz('2008-05-15 12:00:00','+00:00','+10:00')` = convert_tz('2008-05-15 12:00:00','+00:00','+10:00') | fields `convert_tz('2008-05-15 12:00:00','+00:00','+10:00')` - fetched rows / total rows = 1/1 - +-----------------------------------------------------+ - | convert_tz('2008-05-15 12:00:00','+00:00','+10:00') | - |-----------------------------------------------------| - | 2008-05-15 22:00:00 | - +-----------------------------------------------------+ - -The valid timezone range for convert_tz is (-13:59, +14:00) inclusive. Timezones outside of the range, such as +15:00 in this example will return null. 
-Example::
-
-    os> source=people | eval `convert_tz('2008-05-15 12:00:00','+00:00','+15:00')` = convert_tz('2008-05-15 12:00:00','+00:00','+15:00') | fields `convert_tz('2008-05-15 12:00:00','+00:00','+15:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-05-15 12:00:00','+00:00','+15:00') |
-    |-----------------------------------------------------|
-    | null                                                |
-    +-----------------------------------------------------+
-
-Conversion from a positive timezone to a negative timezone that crosses the date line.
-Example::
-
-    os> source=people | eval `convert_tz('2008-05-15 12:00:00','+03:30','-10:00')` = convert_tz('2008-05-15 12:00:00','+03:30','-10:00') | fields `convert_tz('2008-05-15 12:00:00','+03:30','-10:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-05-15 12:00:00','+03:30','-10:00') |
-    |-----------------------------------------------------|
-    | 2008-05-14 22:30:00                                 |
-    +-----------------------------------------------------+
-
-Valid dates are required in convert_tz; invalid dates such as April 31st (not a date in the Gregorian calendar) will result in null.
-Example::
-
-    os> source=people | eval `convert_tz('2008-04-31 12:00:00','+03:30','-10:00')` = convert_tz('2008-04-31 12:00:00','+03:30','-10:00') | fields `convert_tz('2008-04-31 12:00:00','+03:30','-10:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-04-31 12:00:00','+03:30','-10:00') |
-    |-----------------------------------------------------|
-    | null                                                |
-    +-----------------------------------------------------+
-
-Valid dates are required in convert_tz; invalid dates such as February 30th (not a date in the Gregorian calendar) will result in null.
-Example::
-
-    os> source=people | eval `convert_tz('2008-02-30 12:00:00','+03:30','-10:00')` = convert_tz('2008-02-30 12:00:00','+03:30','-10:00') | fields `convert_tz('2008-02-30 12:00:00','+03:30','-10:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-02-30 12:00:00','+03:30','-10:00') |
-    |-----------------------------------------------------|
-    | null                                                |
-    +-----------------------------------------------------+
-
-February 29th 2008 is a valid date because 2008 is a leap year.
-Example::
-
-    os> source=people | eval `convert_tz('2008-02-29 12:00:00','+03:30','-10:00')` = convert_tz('2008-02-29 12:00:00','+03:30','-10:00') | fields `convert_tz('2008-02-29 12:00:00','+03:30','-10:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-02-29 12:00:00','+03:30','-10:00') |
-    |-----------------------------------------------------|
-    | 2008-02-28 22:30:00                                 |
-    +-----------------------------------------------------+
-
-Valid dates are required in convert_tz; invalid dates such as February 29th 2007 (2007 is not a leap year) will result in null.
-Example::
-
-    os> source=people | eval `convert_tz('2007-02-29 12:00:00','+03:30','-10:00')` = convert_tz('2007-02-29 12:00:00','+03:30','-10:00') | fields `convert_tz('2007-02-29 12:00:00','+03:30','-10:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2007-02-29 12:00:00','+03:30','-10:00') |
-    |-----------------------------------------------------|
-    | null                                                |
-    +-----------------------------------------------------+
-
-The valid timezone range for convert_tz is -13:59 to +14:00, inclusive. Timezones outside of this range, such as +14:01 in this example, will return null.
-Example::
-
-    os> source=people | eval `convert_tz('2008-02-01 12:00:00','+14:01','+00:00')` = convert_tz('2008-02-01 12:00:00','+14:01','+00:00') | fields `convert_tz('2008-02-01 12:00:00','+14:01','+00:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-02-01 12:00:00','+14:01','+00:00') |
-    |-----------------------------------------------------|
-    | null                                                |
-    +-----------------------------------------------------+
-
-The valid timezone range for convert_tz is -13:59 to +14:00, inclusive. Timezones at the boundary of this range, such as +14:00 in this example, are valid and return a correctly converted datetime object.
-Example::
-
-    os> source=people | eval `convert_tz('2008-02-01 12:00:00','+14:00','+00:00')` = convert_tz('2008-02-01 12:00:00','+14:00','+00:00') | fields `convert_tz('2008-02-01 12:00:00','+14:00','+00:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-02-01 12:00:00','+14:00','+00:00') |
-    |-----------------------------------------------------|
-    | 2008-01-31 22:00:00                                 |
-    +-----------------------------------------------------+
-
-The valid timezone range for convert_tz is -13:59 to +14:00, inclusive. Timezones outside of this range, such as -14:00, will result in null.
-Example::
-
-    os> source=people | eval `convert_tz('2008-02-01 12:00:00','-14:00','+00:00')` = convert_tz('2008-02-01 12:00:00','-14:00','+00:00') | fields `convert_tz('2008-02-01 12:00:00','-14:00','+00:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-02-01 12:00:00','-14:00','+00:00') |
-    |-----------------------------------------------------|
-    | null                                                |
-    +-----------------------------------------------------+
-
-The valid timezone range for convert_tz is -13:59 to +14:00, inclusive. The timezone -13:59 is within this range, so it is valid and the time is converted.
-Example::
-
-    os> source=people | eval `convert_tz('2008-02-01 12:00:00','-13:59','+00:00')` = convert_tz('2008-02-01 12:00:00','-13:59','+00:00') | fields `convert_tz('2008-02-01 12:00:00','-13:59','+00:00')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------+
-    | convert_tz('2008-02-01 12:00:00','-13:59','+00:00') |
-    |-----------------------------------------------------|
-    | 2008-02-02 01:59:00                                 |
-    +-----------------------------------------------------+
-
-
-CURDATE
--------
-
-Description
->>>>>>>>>>>
-
-Returns the current date as a value in 'YYYY-MM-DD' format.
-CURDATE() returns the current date in UTC at the time the statement is executed.
-
-
-Return type: DATE
-
-Specification: CURDATE() -> DATE
-
-Example::
-
-    > source=people | eval `CURDATE()` = CURDATE() | fields `CURDATE()`
-    fetched rows / total rows = 1/1
-    +------------+
-    | CURDATE()  |
-    |------------|
-    | 2022-08-02 |
-    +------------+
-
-
-CURRENT_DATE
-------------
-
-Description
->>>>>>>>>>>
-
-`CURRENT_DATE()` is a synonym for `CURDATE() <#curdate>`_.
-
-Example::
-
-    > source=people | eval `CURRENT_DATE()` = CURRENT_DATE() | fields `CURRENT_DATE()`
-    fetched rows / total rows = 1/1
-    +------------------+
-    | CURRENT_DATE()   |
-    |------------------|
-    | 2022-08-02       |
-    +------------------+
-
-
-CURRENT_TIME
-------------
-
-Description
->>>>>>>>>>>
-
-`CURRENT_TIME()` is a synonym for `CURTIME() <#curtime>`_.
-
-Example::
-
-    > source=people | eval `CURRENT_TIME()` = CURRENT_TIME() | fields `CURRENT_TIME()`
-    fetched rows / total rows = 1/1
-    +------------------+
-    | CURRENT_TIME()   |
-    |------------------|
-    | 15:39:05         |
-    +------------------+
-
-
-CURRENT_TIMESTAMP
------------------
-
-Description
->>>>>>>>>>>
-
-`CURRENT_TIMESTAMP()` is a synonym for `NOW() <#now>`_.
-
-Example::
-
-    > source=people | eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP() | fields `CURRENT_TIMESTAMP()`
-    fetched rows / total rows = 1/1
-    +-----------------------+
-    | CURRENT_TIMESTAMP()   |
-    |-----------------------|
-    | 2022-08-02 15:54:19   |
-    +-----------------------+
-
-
-CURTIME
--------
-
-Description
->>>>>>>>>>>
-
-Returns the current time as a value in 'hh:mm:ss' format in the UTC time zone.
-CURTIME() returns the time at which the statement began to execute, as `NOW() <#now>`_ does.
-
-Return type: TIME
-
-Specification: CURTIME() -> TIME
-
-Example::
-
-    > source=people | eval `value_1` = CURTIME(), `value_2` = CURTIME() | fields `value_1`, `value_2`
-    fetched rows / total rows = 1/1
-    +----------+----------+
-    | value_1  | value_2  |
-    |----------+----------|
-    | 15:39:05 | 15:39:05 |
-    +----------+----------+
-
-
-DATE
-----
-
-Description
->>>>>>>>>>>
-
-Usage: date(expr) constructs a date type with the input string expr as a date. If the argument is of type date/timestamp, it extracts the date value part from the expression.
-
-Argument type: STRING/DATE/TIMESTAMP
-
-Return type: DATE
-
-Example::
-
-    os> source=people | eval `DATE('2020-08-26')` = DATE('2020-08-26') | fields `DATE('2020-08-26')`
-    fetched rows / total rows = 1/1
-    +--------------------+
-    | DATE('2020-08-26') |
-    |--------------------|
-    | 2020-08-26         |
-    +--------------------+
-
-    os> source=people | eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00')) | fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`
-    fetched rows / total rows = 1/1
-    +----------------------------------------+
-    | DATE(TIMESTAMP('2020-08-26 13:49:00')) |
-    |----------------------------------------|
-    | 2020-08-26                             |
-    +----------------------------------------+
-
-    os> source=people | eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49') | fields `DATE('2020-08-26 13:49')`
-    fetched rows / total rows = 1/1
-    +--------------------------+
-    | DATE('2020-08-26 13:49') |
-    |--------------------------|
-    | 2020-08-26               |
-    +--------------------------+
-
-
-DATE_ADD
---------
-
-Description
->>>>>>>>>>>
-
-Usage: date_add(date, INTERVAL expr unit) adds the interval expr to date. If the first argument is TIME, today's date is used; if the first argument is DATE, time at midnight is used.
-
-Argument type: DATE/TIMESTAMP/TIME, INTERVAL
-
-Return type: TIMESTAMP
-
-Synonyms: `ADDDATE`_
-
-Antonyms: `DATE_SUB`_
-
-Example::
-
-    os> source=people | eval `'2020-08-26' + 1h` = DATE_ADD(DATE('2020-08-26'), INTERVAL 1 HOUR), `ts '2020-08-26 01:01:01' + 1d` = DATE_ADD(TIMESTAMP('2020-08-26 01:01:01'), INTERVAL 1 DAY) | fields `'2020-08-26' + 1h`, `ts '2020-08-26 01:01:01' + 1d`
-    fetched rows / total rows = 1/1
-    +---------------------+-------------------------------+
-    | '2020-08-26' + 1h   | ts '2020-08-26 01:01:01' + 1d |
-    |---------------------+-------------------------------|
-    | 2020-08-26 01:00:00 | 2020-08-27 01:01:01           |
-    +---------------------+-------------------------------+
-
-
-DATE_FORMAT
------------
-
-Description
->>>>>>>>>>>
-
-Usage: date_format(date, format) formats the date argument using the specifiers in the format argument.
-If an argument of type TIME is provided, the local date is used.
-
-.. list-table:: The following table describes the available specifier arguments.
-   :widths: 20 80
-   :header-rows: 1
-
-   * - Specifier
-     - Description
-   * - %a
-     - Abbreviated weekday name (Sun..Sat)
-   * - %b
-     - Abbreviated month name (Jan..Dec)
-   * - %c
-     - Month, numeric (0..12)
-   * - %D
-     - Day of the month with English suffix (0th, 1st, 2nd, 3rd, ...)
-   * - %d
-     - Day of the month, numeric (00..31)
-   * - %e
-     - Day of the month, numeric (0..31)
-   * - %f
-     - Microseconds (000000..999999)
-   * - %H
-     - Hour (00..23)
-   * - %h
-     - Hour (01..12)
-   * - %I
-     - Hour (01..12)
-   * - %i
-     - Minutes, numeric (00..59)
-   * - %j
-     - Day of year (001..366)
-   * - %k
-     - Hour (0..23)
-   * - %l
-     - Hour (1..12)
-   * - %M
-     - Month name (January..December)
-   * - %m
-     - Month, numeric (00..12)
-   * - %p
-     - AM or PM
-   * - %r
-     - Time, 12-hour (hh:mm:ss followed by AM or PM)
-   * - %S
-     - Seconds (00..59)
-   * - %s
-     - Seconds (00..59)
-   * - %T
-     - Time, 24-hour (hh:mm:ss)
-   * - %U
-     - Week (00..53), where Sunday is the first day of the week; WEEK() mode 0
-   * - %u
-     - Week (00..53), where Monday is the first day of the week; WEEK() mode 1
-   * - %V
-     - Week (01..53), where Sunday is the first day of the week; WEEK() mode 2; used with %X
-   * - %v
-     - Week (01..53), where Monday is the first day of the week; WEEK() mode 3; used with %x
-   * - %W
-     - Weekday name (Sunday..Saturday)
-   * - %w
-     - Day of the week (0=Sunday..6=Saturday)
-   * - %X
-     - Year for the week where Sunday is the first day of the week, numeric, four digits; used with %V
-   * - %x
-     - Year for the week, where Monday is the first day of the week, numeric, four digits; used with %v
-   * - %Y
-     - Year, numeric, four digits
-   * - %y
-     - Year, numeric (two digits)
-   * - %%
-     - A literal % character
-   * - %<x>
-     - The literal character x, for any “x” not listed above
-   * - x
-     - The literal character x, for any lowercase/uppercase letter except [aydmshiHIMYDSEL]
-
-Argument type: STRING/DATE/TIME/TIMESTAMP, STRING
-
-Return type: STRING
-
-Example::
-
-    os> source=people | eval `DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f')` = DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r') | fields `DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r')`
-    fetched rows / total rows = 1/1
-    +----------------------------------------------------+----------------------------------------------------------------------+
-    | DATE_FORMAT('1998-01-31 13:14:15.012345', '%T.%f') | DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), '%Y-%b-%D %r') |
-    |----------------------------------------------------+----------------------------------------------------------------------|
-    | 13:14:15.012345                                    | 1998-Jan-31st 01:14:15 PM                                            |
-    +----------------------------------------------------+----------------------------------------------------------------------+
-
-
-DATETIME
---------
-
-Description
->>>>>>>>>>>
-
-Usage: DATETIME(timestamp) / DATETIME(timestamp, to_timezone) converts the given timestamp to a new timezone.
-
-Argument type: TIMESTAMP/STRING
-
-Return type map:
-
-(TIMESTAMP, STRING) -> TIMESTAMP
-
-(TIMESTAMP) -> TIMESTAMP
-
-
-Converting a timestamp with timezone to the timezone given by the second argument.
-Example::
-
-    os> source=people | eval `DATETIME('2004-02-28 23:00:00-10:00', '+10:00')` = DATETIME('2004-02-28 23:00:00-10:00', '+10:00') | fields `DATETIME('2004-02-28 23:00:00-10:00', '+10:00')`
-    fetched rows / total rows = 1/1
-    +-------------------------------------------------+
-    | DATETIME('2004-02-28 23:00:00-10:00', '+10:00') |
-    |-------------------------------------------------|
-    | 2004-02-29 19:00:00                             |
-    +-------------------------------------------------+
-
-
-The valid timezone range for DATETIME is -13:59 to +14:00, inclusive.
Timezones outside of this range will result in null.
-Example::
-
-    os> source=people | eval `DATETIME('2008-01-01 02:00:00', '-14:00')` = DATETIME('2008-01-01 02:00:00', '-14:00') | fields `DATETIME('2008-01-01 02:00:00', '-14:00')`
-    fetched rows / total rows = 1/1
-    +-------------------------------------------+
-    | DATETIME('2008-01-01 02:00:00', '-14:00') |
-    |-------------------------------------------|
-    | null                                      |
-    +-------------------------------------------+
-
-
-DATE_SUB
---------
-
-Description
->>>>>>>>>>>
-
-Usage: date_sub(date, INTERVAL expr unit) subtracts the interval expr from date. If the first argument is TIME, today's date is used; if the first argument is DATE, time at midnight is used.
-
-Argument type: DATE/TIMESTAMP/TIME, INTERVAL
-
-Return type: TIMESTAMP
-
-Synonyms: `SUBDATE`_
-
-Antonyms: `DATE_ADD`_
-
-Example::
-
-    os> source=people | eval `'2008-01-02' - 31d` = DATE_SUB(DATE('2008-01-02'), INTERVAL 31 DAY), `ts '2020-08-26 01:01:01' - 1h` = DATE_SUB(TIMESTAMP('2020-08-26 01:01:01'), INTERVAL 1 HOUR) | fields `'2008-01-02' - 31d`, `ts '2020-08-26 01:01:01' - 1h`
-    fetched rows / total rows = 1/1
-    +---------------------+-------------------------------+
-    | '2008-01-02' - 31d  | ts '2020-08-26 01:01:01' - 1h |
-    |---------------------+-------------------------------|
-    | 2007-12-02 00:00:00 | 2020-08-26 00:01:01           |
-    +---------------------+-------------------------------+
-
-
-DATEDIFF
---------
-
-Description
->>>>>>>>>>>
-
-Usage: Calculates the difference between the date parts of the given values. If the first argument is of type TIME, today's date is used.
-
-Argument type: DATE/TIMESTAMP/TIME, DATE/TIMESTAMP/TIME
-
-Return type: LONG
-
-Example::
-
-    os> source=people | eval `'2000-01-02' - '2000-01-01'` = DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - '2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')), `today - today` = DATEDIFF(TIME('23:59:59'), TIME('00:00:00')) | fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`, `today - today`
-    fetched rows / total rows = 1/1
-    +-----------------------------+-----------------------------+---------------+
-    | '2000-01-02' - '2000-01-01' | '2001-02-01' - '2004-01-01' | today - today |
-    |-----------------------------+-----------------------------+---------------|
-    | 1                           | -1064                       | 0             |
-    +-----------------------------+-----------------------------+---------------+
-
-
-DAY
----
-
-Description
->>>>>>>>>>>
-
-Usage: day(date) extracts the day of the month for date, in the range 1 to 31.
-
-Argument type: STRING/DATE/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `DAYOFMONTH`_, `DAY_OF_MONTH`_
-
-Example::
-
-    os> source=people | eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26')) | fields `DAY(DATE('2020-08-26'))`
-    fetched rows / total rows = 1/1
-    +-------------------------+
-    | DAY(DATE('2020-08-26')) |
-    |-------------------------|
-    | 26                      |
-    +-------------------------+
-
-
-DAYNAME
--------
-
-Description
->>>>>>>>>>>
-
-Usage: dayname(date) returns the name of the weekday for date, i.e. Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, or Sunday.
- -Argument type: STRING/DATE/TIMESTAMP - -Return type: STRING - -Example:: - - os> source=people | eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) | fields `DAYNAME(DATE('2020-08-26'))` - fetched rows / total rows = 1/1 - +-----------------------------+ - | DAYNAME(DATE('2020-08-26')) | - |-----------------------------| - | Wednesday | - +-----------------------------+ - - -DAYOFMONTH ----------- - -Description ->>>>>>>>>>> - -Usage: dayofmonth(date) extracts the day of the month for date, in the range 1 to 31. - -Argument type: STRING/DATE/TIMESTAMP - -Return type: INTEGER - -Synonyms: `DAY`_, `DAY_OF_MONTH`_ - -Example:: - - os> source=people | eval `DAYOFMONTH(DATE('2020-08-26'))` = DAYOFMONTH(DATE('2020-08-26')) | fields `DAYOFMONTH(DATE('2020-08-26'))` - fetched rows / total rows = 1/1 - +--------------------------------+ - | DAYOFMONTH(DATE('2020-08-26')) | - |--------------------------------| - | 26 | - +--------------------------------+ - - -DAY_OF_MONTH ------------- - -Description ->>>>>>>>>>> - -Usage: day_of_month(date) extracts the day of the month for date, in the range 1 to 31. - -Argument type: STRING/DATE/TIMESTAMP - -Return type: INTEGER - -Synonyms: `DAY`_, `DAYOFMONTH`_ - -Example:: - - os> source=people | eval `DAY_OF_MONTH(DATE('2020-08-26'))` = DAY_OF_MONTH(DATE('2020-08-26')) | fields `DAY_OF_MONTH(DATE('2020-08-26'))` - fetched rows / total rows = 1/1 - +----------------------------------+ - | DAY_OF_MONTH(DATE('2020-08-26')) | - |----------------------------------| - | 26 | - +----------------------------------+ - - -DAYOFWEEK ---------- - -Description ->>>>>>>>>>> - -Usage: dayofweek(date) returns the weekday index for date (1 = Sunday, 2 = Monday, ..., 7 = Saturday). - -Argument type: STRING/DATE/TIMESTAMP - -Return type: INTEGER - -Synonyms: `DAY_OF_WEEK`_ - -Example:: - - os> source=people | eval `DAYOFWEEK(DATE('2020-08-26'))` = DAYOFWEEK(DATE('2020-08-26')) | fields `DAYOFWEEK(DATE('2020-08-26'))` - fetched rows / total rows = 1/1 - +-------------------------------+ - | DAYOFWEEK(DATE('2020-08-26')) | - |-------------------------------| - | 4 | - +-------------------------------+ - - -DAY_OF_WEEK ------------ - -Description ->>>>>>>>>>> - -Usage: day_of_week(date) returns the weekday index for date (1 = Sunday, 2 = Monday, ..., 7 = Saturday). - -Argument type: STRING/DATE/TIMESTAMP - -Return type: INTEGER - -Synonyms: `DAYOFWEEK`_ - -Example:: - - os> source=people | eval `DAY_OF_WEEK(DATE('2020-08-26'))` = DAY_OF_WEEK(DATE('2020-08-26')) | fields `DAY_OF_WEEK(DATE('2020-08-26'))` - fetched rows / total rows = 1/1 - +---------------------------------+ - | DAY_OF_WEEK(DATE('2020-08-26')) | - |---------------------------------| - | 4 | - +---------------------------------+ - - -DAYOFYEAR ---------- - -Description ->>>>>>>>>>> - -Usage: dayofyear(date) returns the day of the year for date, in the range 1 to 366. - -Argument type: STRING/DATE/TIMESTAMP - -Return type: INTEGER - -Synonyms: `DAY_OF_YEAR`_ - -Example:: - - os> source=people | eval `DAYOFYEAR(DATE('2020-08-26'))` = DAYOFYEAR(DATE('2020-08-26')) | fields `DAYOFYEAR(DATE('2020-08-26'))` - fetched rows / total rows = 1/1 - +-------------------------------+ - | DAYOFYEAR(DATE('2020-08-26')) | - |-------------------------------| - | 239 | - +-------------------------------+ - - -DAY_OF_YEAR ------------ - -Description ->>>>>>>>>>> - -Usage: day_of_year(date) returns the day of the year for date, in the range 1 to 366. 
-
-Argument type: STRING/DATE/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `DAYOFYEAR`_
-
-Example::
-
-    os> source=people | eval `DAY_OF_YEAR(DATE('2020-08-26'))` = DAY_OF_YEAR(DATE('2020-08-26')) | fields `DAY_OF_YEAR(DATE('2020-08-26'))`
-    fetched rows / total rows = 1/1
-    +---------------------------------+
-    | DAY_OF_YEAR(DATE('2020-08-26')) |
-    |---------------------------------|
-    | 239                             |
-    +---------------------------------+
-
-
-EXTRACT
--------
-
-Description
->>>>>>>>>>>
-
-Usage: extract(part FROM date) returns a LONG whose digits are ordered according to the given 'part' argument.
-The specific format of the returned long is determined by the table below.
-
-Argument type: PART, where PART is one of the tokens in the table below.
-
-The format specifiers found in this table are the same as those found in the `DATE_FORMAT`_ function.
-
-.. list-table:: The following table describes the mapping of a 'part' to a particular format.
-   :widths: 20 80
-   :header-rows: 1
-
-   * - Part
-     - Format
-   * - MICROSECOND
-     - %f
-   * - SECOND
-     - %s
-   * - MINUTE
-     - %i
-   * - HOUR
-     - %H
-   * - DAY
-     - %d
-   * - WEEK
-     - %V
-   * - MONTH
-     - %m
-   * - YEAR
-     - %X
-   * - SECOND_MICROSECOND
-     - %s%f
-   * - MINUTE_MICROSECOND
-     - %i%s%f
-   * - MINUTE_SECOND
-     - %i%s
-   * - HOUR_MICROSECOND
-     - %H%i%s%f
-   * - HOUR_SECOND
-     - %H%i%s
-   * - HOUR_MINUTE
-     - %H%i
-   * - DAY_MICROSECOND
-     - %d%H%i%s%f
-   * - DAY_SECOND
-     - %d%H%i%s
-   * - DAY_MINUTE
-     - %d%H%i
-   * - DAY_HOUR
-     - %d%H
-   * - YEAR_MONTH
-     - %X%m
-
-Return type: LONG
-
-Example::
-
-    os> source=people | eval `extract(YEAR_MONTH FROM "2023-02-07 10:11:12")` = extract(YEAR_MONTH FROM "2023-02-07 10:11:12") | fields `extract(YEAR_MONTH FROM "2023-02-07 10:11:12")`
-    fetched rows / total rows = 1/1
-    +------------------------------------------------+
-    | extract(YEAR_MONTH FROM "2023-02-07 10:11:12") |
-    |------------------------------------------------|
-    | 202302                                         |
-    +------------------------------------------------+
-
-
-FROM_DAYS
----------
-
-Description
->>>>>>>>>>>
-
-Usage: from_days(N) returns the date value given the day number N.
-
-Argument type: INTEGER/LONG
-
-Return type: DATE
-
-Example::
-
-    os> source=people | eval `FROM_DAYS(733687)` = FROM_DAYS(733687) | fields `FROM_DAYS(733687)`
-    fetched rows / total rows = 1/1
-    +-------------------+
-    | FROM_DAYS(733687) |
-    |-------------------|
-    | 2008-10-07        |
-    +-------------------+
-
-
-FROM_UNIXTIME
--------------
-
-Description
->>>>>>>>>>>
-
-Usage: Returns a representation of the argument given as a timestamp or character string value. This performs the reverse conversion of the `UNIX_TIMESTAMP`_ function.
-If a second argument is provided, it is used to format the result in the same way as the format string used by the `DATE_FORMAT`_ function.
-If the timestamp is outside of the range 1970-01-01 00:00:00 - 3001-01-18 23:59:59.999999 (0 to 32536771199.999999 in epoch time), the function returns NULL.
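-
-For instance, an epoch value just past this upper bound should return NULL (an illustrative sketch based on the documented range, not a doctest-verified example)::
-
-    > source=people | eval `FROM_UNIXTIME(32536771200)` = FROM_UNIXTIME(32536771200) | fields `FROM_UNIXTIME(32536771200)`
-    fetched rows / total rows = 1/1
-    +----------------------------+
-    | FROM_UNIXTIME(32536771200) |
-    |----------------------------|
-    | null                       |
-    +----------------------------+
-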
-Argument type: DOUBLE, STRING
-
-Return type map:
-
-DOUBLE -> TIMESTAMP
-
-DOUBLE, STRING -> STRING
-
-Examples::
-
-    os> source=people | eval `FROM_UNIXTIME(1220249547)` = FROM_UNIXTIME(1220249547) | fields `FROM_UNIXTIME(1220249547)`
-    fetched rows / total rows = 1/1
-    +---------------------------+
-    | FROM_UNIXTIME(1220249547) |
-    |---------------------------|
-    | 2008-09-01 06:12:27       |
-    +---------------------------+
-
-    os> source=people | eval `FROM_UNIXTIME(1220249547, '%T')` = FROM_UNIXTIME(1220249547, '%T') | fields `FROM_UNIXTIME(1220249547, '%T')`
-    fetched rows / total rows = 1/1
-    +---------------------------------+
-    | FROM_UNIXTIME(1220249547, '%T') |
-    |---------------------------------|
-    | 06:12:27                        |
-    +---------------------------------+
-
-
-GET_FORMAT
-----------
-
-Description
->>>>>>>>>>>
-
-Usage: Returns a string value containing string format specifiers based on the input arguments.
-
-Argument type: TYPE, STRING, where TYPE must be one of the following tokens: [DATE, TIME, TIMESTAMP], and
-STRING must be one of the following tokens: ["USA", "JIS", "ISO", "EUR", "INTERNAL"] (double quotes can be replaced by single quotes).
-
-Examples::
-
-    os> source=people | eval `GET_FORMAT(DATE, 'USA')` = GET_FORMAT(DATE, 'USA') | fields `GET_FORMAT(DATE, 'USA')`
-    fetched rows / total rows = 1/1
-    +-------------------------+
-    | GET_FORMAT(DATE, 'USA') |
-    |-------------------------|
-    | %m.%d.%Y                |
-    +-------------------------+
-
-
-HOUR
-----
-
-Description
->>>>>>>>>>>
-
-Usage: hour(time) extracts the hour value from time. Unlike a time-of-day value, a time value has a larger range and can be greater than 23, so the return value of hour(time) can also be greater than 23.
-
-Argument type: STRING/TIME/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `HOUR_OF_DAY`_
-
-Example::
-
-    os> source=people | eval `HOUR(TIME('01:02:03'))` = HOUR(TIME('01:02:03')) | fields `HOUR(TIME('01:02:03'))`
-    fetched rows / total rows = 1/1
-    +------------------------+
-    | HOUR(TIME('01:02:03')) |
-    |------------------------|
-    | 1                      |
-    +------------------------+
-
-
-HOUR_OF_DAY
------------
-
-Description
->>>>>>>>>>>
-
-Usage: hour_of_day(time) extracts the hour value from time. Unlike a time-of-day value, a time value has a larger range and can be greater than 23, so the return value of hour_of_day(time) can also be greater than 23.
-
-Argument type: STRING/TIME/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `HOUR`_
-
-Example::
-
-    os> source=people | eval `HOUR_OF_DAY(TIME('01:02:03'))` = HOUR_OF_DAY(TIME('01:02:03')) | fields `HOUR_OF_DAY(TIME('01:02:03'))`
-    fetched rows / total rows = 1/1
-    +-------------------------------+
-    | HOUR_OF_DAY(TIME('01:02:03')) |
-    |-------------------------------|
-    | 1                             |
-    +-------------------------------+
-
-
-LAST_DAY
---------
-
-Description
->>>>>>>>>>>
-
-Usage: Returns the last day of the month as a DATE for a valid argument.
-
-Argument type: DATE/STRING/TIMESTAMP/TIME
-
-Return type: DATE
-
-Example::
-
-    os> source=people | eval `last_day('2023-02-06')` = last_day('2023-02-06') | fields `last_day('2023-02-06')`
-    fetched rows / total rows = 1/1
-    +------------------------+
-    | last_day('2023-02-06') |
-    |------------------------|
-    | 2023-02-28             |
-    +------------------------+
-
-
-LOCALTIMESTAMP
---------------
-
-Description
->>>>>>>>>>>
-
-`LOCALTIMESTAMP()` is a synonym for `NOW() <#now>`_.
-
-Example::
-
-    > source=people | eval `LOCALTIMESTAMP()` = LOCALTIMESTAMP() | fields `LOCALTIMESTAMP()`
-    fetched rows / total rows = 1/1
-    +---------------------+
-    | LOCALTIMESTAMP()    |
-    |---------------------|
-    | 2022-08-02 15:54:19 |
-    +---------------------+
-
-
-LOCALTIME
----------
-
-Description
->>>>>>>>>>>
-
-`LOCALTIME()` is a synonym for `NOW() <#now>`_.
-
-Example::
-
-    > source=people | eval `LOCALTIME()` = LOCALTIME() | fields `LOCALTIME()`
-    fetched rows / total rows = 1/1
-    +---------------------+
-    | LOCALTIME()         |
-    |---------------------|
-    | 2022-08-02 15:54:19 |
-    +---------------------+
-
-
-MAKEDATE
---------
-
-Description
->>>>>>>>>>>
-
-Returns a date, given `year` and `day-of-year` values. `day-of-year` must be greater than 0 or the result is `NULL`. The result is also `NULL` if either argument is `NULL`.
-Arguments are rounded to integers.
-
-Limitations:
-- Zero `year` is interpreted as 2000;
-- Negative `year` is not accepted;
-- `day-of-year` must be greater than zero;
-- `day-of-year` may be greater than 365/366, in which case the calculation rolls over to the following year(s) (see example).
-
-Specifications:
-
-1. MAKEDATE(DOUBLE, DOUBLE) -> DATE
-
-Argument type: DOUBLE
-
-Return type: DATE
-
-Example::
-
-    os> source=people | eval `MAKEDATE(1945, 5.9)` = MAKEDATE(1945, 5.9), `MAKEDATE(1984, 1984)` = MAKEDATE(1984, 1984) | fields `MAKEDATE(1945, 5.9)`, `MAKEDATE(1984, 1984)`
-    fetched rows / total rows = 1/1
-    +---------------------+----------------------+
-    | MAKEDATE(1945, 5.9) | MAKEDATE(1984, 1984) |
-    |---------------------+----------------------|
-    | 1945-01-06          | 1989-06-06           |
-    +---------------------+----------------------+
-
-
-MAKETIME
---------
-
-Description
->>>>>>>>>>>
-
-Returns a time value calculated from the hour, minute, and second arguments. Returns `NULL` if any of its arguments are `NULL`.
-The `second` argument can have a fractional part; the other arguments are rounded to integers.
-
-Limitations:
-- A 24-hour clock is used; the available time range is [00:00:00.0 - 23:59:59.(9)];
-- Up to 9 digits of the fractional seconds part are kept (nanosecond precision).
-
-Specifications:
-
-1. MAKETIME(DOUBLE, DOUBLE, DOUBLE) -> TIME
-
-Argument type: DOUBLE
-
-Return type: TIME
-
-Example::
-
-    os> source=people | eval `MAKETIME(20, 30, 40)` = MAKETIME(20, 30, 40), `MAKETIME(20.2, 49.5, 42.100502)` = MAKETIME(20.2, 49.5, 42.100502) | fields `MAKETIME(20, 30, 40)`, `MAKETIME(20.2, 49.5, 42.100502)`
-    fetched rows / total rows = 1/1
-    +----------------------+---------------------------------+
-    | MAKETIME(20, 30, 40) | MAKETIME(20.2, 49.5, 42.100502) |
-    |----------------------+---------------------------------|
-    | 20:30:40             | 20:50:42.100502                 |
-    +----------------------+---------------------------------+
-
-
-MICROSECOND
------------
-
-Description
->>>>>>>>>>>
-
-Usage: microsecond(expr) returns the microseconds from the time or timestamp expression expr as a number in the range from 0 to 999999.
-
-Argument type: STRING/TIME/TIMESTAMP
-
-Return type: INTEGER
-
-Example::
-
-    os> source=people | eval `MICROSECOND(TIME('01:02:03.123456'))` = MICROSECOND(TIME('01:02:03.123456')) | fields `MICROSECOND(TIME('01:02:03.123456'))`
-    fetched rows / total rows = 1/1
-    +--------------------------------------+
-    | MICROSECOND(TIME('01:02:03.123456')) |
-    |--------------------------------------|
-    | 123456                               |
-    +--------------------------------------+
-
-
-MINUTE
-------
-
-Description
->>>>>>>>>>>
-
-Usage: minute(time) returns the minute for time, in the range 0 to 59.
-
-Argument type: STRING/TIME/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `MINUTE_OF_HOUR`_
-
-Example::
-
-    os> source=people | eval `MINUTE(TIME('01:02:03'))` = MINUTE(TIME('01:02:03')) | fields `MINUTE(TIME('01:02:03'))`
-    fetched rows / total rows = 1/1
-    +--------------------------+
-    | MINUTE(TIME('01:02:03')) |
-    |--------------------------|
-    | 2                        |
-    +--------------------------+
-
-
-MINUTE_OF_DAY
--------------
-
-Description
->>>>>>>>>>>
-
-Usage: minute_of_day(time) returns the minute of the day for time, in the range 0 to 1439.
-
-Argument type: STRING/TIME/TIMESTAMP
-
-Return type: INTEGER
-
-Example::
-
-    os> source=people | eval `MINUTE_OF_DAY(TIME('01:02:03'))` = MINUTE_OF_DAY(TIME('01:02:03')) | fields `MINUTE_OF_DAY(TIME('01:02:03'))`
-    fetched rows / total rows = 1/1
-    +---------------------------------+
-    | MINUTE_OF_DAY(TIME('01:02:03')) |
-    |---------------------------------|
-    | 62                              |
-    +---------------------------------+
-
-
-MINUTE_OF_HOUR
---------------
-
-Description
->>>>>>>>>>>
-
-Usage: minute_of_hour(time) returns the minute for time, in the range 0 to 59.
-
-Argument type: STRING/TIME/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `MINUTE`_
-
-Example::
-
-    os> source=people | eval `MINUTE_OF_HOUR(TIME('01:02:03'))` = MINUTE_OF_HOUR(TIME('01:02:03')) | fields `MINUTE_OF_HOUR(TIME('01:02:03'))`
-    fetched rows / total rows = 1/1
-    +----------------------------------+
-    | MINUTE_OF_HOUR(TIME('01:02:03')) |
-    |----------------------------------|
-    | 2                                |
-    +----------------------------------+
-
-
-MONTH
------
-
-Description
->>>>>>>>>>>
-
-Usage: month(date) returns the month for date, in the range 1 to 12 for January to December.
-
-Argument type: STRING/DATE/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `MONTH_OF_YEAR`_
-
-Example::
-
-    os> source=people | eval `MONTH(DATE('2020-08-26'))` = MONTH(DATE('2020-08-26')) | fields `MONTH(DATE('2020-08-26'))`
-    fetched rows / total rows = 1/1
-    +---------------------------+
-    | MONTH(DATE('2020-08-26')) |
-    |---------------------------|
-    | 8                         |
-    +---------------------------+
-
-
-MONTH_OF_YEAR
--------------
-
-Description
->>>>>>>>>>>
-
-Usage: month_of_year(date) returns the month for date, in the range 1 to 12 for January to December.
-
-Argument type: STRING/DATE/TIMESTAMP
-
-Return type: INTEGER
-
-Synonyms: `MONTH`_
-
-Example::
-
-    os> source=people | eval `MONTH_OF_YEAR(DATE('2020-08-26'))` = MONTH_OF_YEAR(DATE('2020-08-26')) | fields `MONTH_OF_YEAR(DATE('2020-08-26'))`
-    fetched rows / total rows = 1/1
-    +-----------------------------------+
-    | MONTH_OF_YEAR(DATE('2020-08-26')) |
-    |-----------------------------------|
-    | 8                                 |
-    +-----------------------------------+
-
-
-MONTHNAME
----------
-
-Description
->>>>>>>>>>>
-
-Usage: monthname(date) returns the full name of the month for date.
-
-Argument type: STRING/DATE/TIMESTAMP
-
-Return type: STRING
-
-Example::
-
-    os> source=people | eval `MONTHNAME(DATE('2020-08-26'))` = MONTHNAME(DATE('2020-08-26')) | fields `MONTHNAME(DATE('2020-08-26'))`
-    fetched rows / total rows = 1/1
-    +-------------------------------+
-    | MONTHNAME(DATE('2020-08-26')) |
-    |-------------------------------|
-    | August                        |
-    +-------------------------------+
-
-
-NOW
----
-
-Description
->>>>>>>>>>>
-
-Returns the current date and time as a value in 'YYYY-MM-DD hh:mm:ss' format. The value is expressed in the UTC time zone.
-`NOW()` returns a constant time that indicates the time at which the statement began to execute.
This differs from the behavior of `SYSDATE() <#sysdate>`_, which returns the exact time at which it executes.
-
-Return type: TIMESTAMP
-
-Specification: NOW() -> TIMESTAMP
-
-Example::
-
-    > source=people | eval `value_1` = NOW(), `value_2` = NOW() | fields `value_1`, `value_2`
-    fetched rows / total rows = 1/1
-    +---------------------+---------------------+
-    | value_1             | value_2             |
-    |---------------------+---------------------|
-    | 2022-08-02 15:39:05 | 2022-08-02 15:39:05 |
-    +---------------------+---------------------+
-
-
-PERIOD_ADD
-----------
-
-Description
->>>>>>>>>>>
-
-Usage: period_add(P, N) adds N months to period P (in the format YYMM or YYYYMM). Returns a value in the format YYYYMM.
-
-Argument type: INTEGER, INTEGER
-
-Return type: INTEGER
-
-Example::
-
-    os> source=people | eval `PERIOD_ADD(200801, 2)` = PERIOD_ADD(200801, 2), `PERIOD_ADD(200801, -12)` = PERIOD_ADD(200801, -12) | fields `PERIOD_ADD(200801, 2)`, `PERIOD_ADD(200801, -12)`
-    fetched rows / total rows = 1/1
-    +-----------------------+-------------------------+
-    | PERIOD_ADD(200801, 2) | PERIOD_ADD(200801, -12) |
-    |-----------------------+-------------------------|
-    | 200803                | 200701                  |
-    +-----------------------+-------------------------+
-
-
-PERIOD_DIFF
------------
-
-Description
->>>>>>>>>>>
-
-Usage: period_diff(P1, P2) returns the number of months between periods P1 and P2 given in the format YYMM or YYYYMM.
-
-Argument type: INTEGER, INTEGER
-
-Return type: INTEGER
-
-Example::
-
-    os> source=people | eval `PERIOD_DIFF(200802, 200703)` = PERIOD_DIFF(200802, 200703), `PERIOD_DIFF(200802, 201003)` = PERIOD_DIFF(200802, 201003) | fields `PERIOD_DIFF(200802, 200703)`, `PERIOD_DIFF(200802, 201003)`
-    fetched rows / total rows = 1/1
-    +-----------------------------+-----------------------------+
-    | PERIOD_DIFF(200802, 200703) | PERIOD_DIFF(200802, 201003) |
-    |-----------------------------+-----------------------------|
-    | 11                          | -25                         |
-    +-----------------------------+-----------------------------+
-
-
-QUARTER
--------
-
-Description
->>>>>>>>>>>
-
-Usage: quarter(date) returns the quarter of the year for date, in the range 1 to 4.
-
-Argument type: STRING/DATE/TIMESTAMP
-
-Return type: INTEGER
-
-Example::
-
-    os> source=people | eval `QUARTER(DATE('2020-08-26'))` = QUARTER(DATE('2020-08-26')) | fields `QUARTER(DATE('2020-08-26'))`
-    fetched rows / total rows = 1/1
-    +-----------------------------+
-    | QUARTER(DATE('2020-08-26')) |
-    |-----------------------------|
-    | 3                           |
-    +-----------------------------+
-
-
-SEC_TO_TIME
------------
-
-Description
->>>>>>>>>>>
-
-Usage: sec_to_time(number) returns the time in HH:mm:ss[.nnnnnn] format.
-Note that the function returns a time between 00:00:00 and 23:59:59.
-If an input value is too large (greater than 86399), the function will wrap around and begin returning outputs starting from 00:00:00.
-If an input value is too small (less than 0), the function will wrap around and begin returning outputs counting down from 23:59:59.
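-
-For instance, based on the wrap-around behavior described above, an input of 86401 should wrap to one second past midnight, and an input of -1 should wrap back to the last second of the day (an illustrative sketch, not a doctest-verified example)::
-
-    > source=people | eval `SEC_TO_TIME(86401)` = SEC_TO_TIME(86401), `SEC_TO_TIME(-1)` = SEC_TO_TIME(-1) | fields `SEC_TO_TIME(86401)`, `SEC_TO_TIME(-1)`
-    fetched rows / total rows = 1/1
-    +--------------------+-----------------+
-    | SEC_TO_TIME(86401) | SEC_TO_TIME(-1) |
-    |--------------------+-----------------|
-    | 00:00:01           | 23:59:59        |
-    +--------------------+-----------------+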
- -Argument type: INTEGER, LONG, DOUBLE, FLOAT - -Return type: TIME - -Example:: - - os> source=people | eval `SEC_TO_TIME(3601)` = SEC_TO_TIME(3601) | eval `SEC_TO_TIME(1234.123)` = SEC_TO_TIME(1234.123) | fields `SEC_TO_TIME(3601)`, `SEC_TO_TIME(1234.123)` - fetched rows / total rows = 1/1 - +-------------------+-----------------------+ - | SEC_TO_TIME(3601) | SEC_TO_TIME(1234.123) | - |-------------------+-----------------------| - | 01:00:01 | 00:20:34.123 | - +-------------------+-----------------------+ - - -SECOND ------- - -Description ->>>>>>>>>>> - -Usage: second(time) returns the second for time, in the range 0 to 59. - -Argument type: STRING/TIME/TIMESTAMP - -Return type: INTEGER - -Synonyms: `SECOND_OF_MINUTE`_ - -Example:: - - os> source=people | eval `SECOND(TIME('01:02:03'))` = SECOND(TIME('01:02:03')) | fields `SECOND(TIME('01:02:03'))` - fetched rows / total rows = 1/1 - +--------------------------+ - | SECOND(TIME('01:02:03')) | - |--------------------------| - | 3 | - +--------------------------+ - - -SECOND_OF_MINUTE ----------------- - -Description ->>>>>>>>>>> - -Usage: second_of_minute(time) returns the second for time, in the range 0 to 59. - -Argument type: STRING/TIME/TIMESTAMP - -Return type: INTEGER - -Synonyms: `SECOND`_ - -Example:: - - os> source=people | eval `SECOND_OF_MINUTE(TIME('01:02:03'))` = SECOND_OF_MINUTE(TIME('01:02:03')) | fields `SECOND_OF_MINUTE(TIME('01:02:03'))` - fetched rows / total rows = 1/1 - +------------------------------------+ - | SECOND_OF_MINUTE(TIME('01:02:03')) | - |------------------------------------| - | 3 | - +------------------------------------+ - - -STRFTIME --------- - -**Version: 3.3.0** - -Description ->>>>>>>>>>> - -Usage: strftime(time, format) takes a UNIX timestamp (in seconds) and renders it as a string using the format specified. For numeric inputs, the UNIX time must be in seconds. Values greater than 100000000000 are automatically treated as milliseconds and converted to seconds. -You can use time format variables with the strftime function. This function performs the reverse operation of `UNIX_TIMESTAMP`_ and is similar to `FROM_UNIXTIME`_ but with POSIX-style format specifiers. - -.. note:: - - **Available only when Calcite engine is enabled** - - All timestamps are interpreted as UTC timezone - - Text formatting uses language-neutral Locale.ROOT (weekday and month names appear in abbreviated form) - - String inputs are NOT supported - use `unix_timestamp()` to convert strings first - - Functions that return date/time values (like `date()`, `now()`, `timestamp()`) are supported - -Argument type: INTEGER/LONG/DOUBLE/TIMESTAMP, STRING - -Return type: STRING - -Format specifiers: - -.. list-table:: The following table describes the available specifier arguments. 
- :widths: 20 80 - :header-rows: 1 - - * - Specifier - - Description - * - %a - - Abbreviated weekday name (Mon..Sun) - * - %A - - Weekday name (Mon..Sun) - Note: Locale.ROOT uses abbreviated form - * - %b - - Abbreviated month name (Jan..Dec) - * - %B - - Month name (Jan..Dec) - Note: Locale.ROOT uses abbreviated form - * - %c - - Date and time (e.g., Mon Jul 18 09:30:00 2019) - * - %C - - Century as 2-digit decimal number - * - %d - - Day of the month, zero-padded (01..31) - * - %e - - Day of the month, space-padded ( 1..31) - * - %Ez - - Timezone offset in minutes from UTC (e.g., +0 for UTC, +330 for IST, -300 for EST) - * - %f - - Microseconds as decimal number (000000..999999) - * - %F - - ISO 8601 date format (%Y-%m-%d) - * - %g - - ISO 8601 year without century (00..99) - * - %G - - ISO 8601 year with century - * - %H - - Hour (24-hour clock) (00..23) - * - %I - - Hour (12-hour clock) (01..12) - * - %j - - Day of year (001..366) - * - %k - - Hour (24-hour clock), space-padded ( 0..23) - * - %m - - Month as decimal number (01..12) - * - %M - - Minute (00..59) - * - %N - - Subsecond digits (default %9N = nanoseconds). Accepts any precision value from 1-9 (e.g., %3N = 3 digits, %5N = 5 digits, %9N = 9 digits). The precision directly controls the number of digits displayed - * - %p - - AM or PM - * - %Q - - Subsecond component (default milliseconds). Can specify precision: %3Q = milliseconds, %6Q = microseconds, %9Q = nanoseconds. Other precision values (e.g., %5Q) default to %3Q - * - %s - - UNIX Epoch timestamp in seconds - * - %S - - Second (00..59) - * - %T - - Time in 24-hour notation (%H:%M:%S) - * - %U - - Week of year starting from 0 (00..53) - * - %V - - ISO week number (01..53) - * - %w - - Weekday as decimal (0=Sunday..6=Saturday) - * - %x - - Date in MM/dd/yyyy format (e.g., 07/13/2019) - * - %X - - Time in HH:mm:ss format (e.g., 09:30:00) - * - %y - - Year without century (00..99) - * - %Y - - Year with century - * - %z - - Timezone offset (+hhmm or -hhmm) - * - %:z - - Timezone offset with colon (+hh:mm or -hh:mm) - * - %::z - - Timezone offset with colons (+hh:mm:ss) - * - %:::z - - Timezone offset hour only (+hh or -hh) - * - %Z - - Timezone abbreviation (e.g., EST, PDT) - * - %% - - Literal % character - -Examples:: - - #os> source=people | eval `strftime(1521467703, "%Y-%m-%dT%H:%M:%S")` = strftime(1521467703, "%Y-%m-%dT%H:%M:%S") | fields `strftime(1521467703, "%Y-%m-%dT%H:%M:%S")` - fetched rows / total rows = 1/1 - +-------------------------------------------+ - | strftime(1521467703, "%Y-%m-%dT%H:%M:%S") | - |-------------------------------------------| - | 2018-03-19T13:55:03 | - +-------------------------------------------+ - - #os> source=people | eval `strftime(1521467703, "%F %T")` = strftime(1521467703, "%F %T") | fields `strftime(1521467703, "%F %T")` - fetched rows / total rows = 1/1 - +-------------------------------+ - | strftime(1521467703, "%F %T") | - |-------------------------------| - | 2018-03-19 13:55:03 | - +-------------------------------+ - - #os> source=people | eval `strftime(1521467703, "%a %b %d, %Y")` = strftime(1521467703, "%a %b %d, %Y") | fields `strftime(1521467703, "%a %b %d, %Y")` - fetched rows / total rows = 1/1 - +--------------------------------------+ - | strftime(1521467703, "%a %b %d, %Y") | - |--------------------------------------| - | Mon Mar 19, 2018 | - +--------------------------------------+ - - #os> source=people | eval `strftime(1521467703, "%%Y")` = strftime(1521467703, "%%Y") | fields `strftime(1521467703, "%%Y")` - 
fetched rows / total rows = 1/1
-    +-----------------------------+
-    | strftime(1521467703, "%%Y") |
-    |-----------------------------|
-    | %Y                          |
-    +-----------------------------+
-
-    #os> source=people | eval `strftime(date('2020-09-16'), "%Y-%m-%d")` = strftime(date('2020-09-16'), "%Y-%m-%d") | fields `strftime(date('2020-09-16'), "%Y-%m-%d")`
-    fetched rows / total rows = 1/1
-    +------------------------------------------+
-    | strftime(date('2020-09-16'), "%Y-%m-%d") |
-    |------------------------------------------|
-    | 2020-09-16                               |
-    +------------------------------------------+
-
-    #os> source=people | eval `strftime(timestamp('2020-09-16 14:30:00'), "%F %T")` = strftime(timestamp('2020-09-16 14:30:00'), "%F %T") | fields `strftime(timestamp('2020-09-16 14:30:00'), "%F %T")`
-    fetched rows / total rows = 1/1
-    +------------------------------------------------------+
-    | strftime(timestamp('2020-09-16 14:30:00'), "%F %T") |
-    |------------------------------------------------------|
-    | 2020-09-16 14:30:00                                  |
-    +------------------------------------------------------+
-
-    #os> source=people | eval `strftime(now(), "%Y-%m-%d %H:%M:%S")` = strftime(now(), "%Y-%m-%d %H:%M:%S") | fields `strftime(now(), "%Y-%m-%d %H:%M:%S")`
-    fetched rows / total rows = 1/1
-    +---------------------------------------+
-    | strftime(now(), "%Y-%m-%d %H:%M:%S")  |
-    |---------------------------------------|
-    | 2025-09-03 12:30:45                   |
-    +---------------------------------------+
-
-
-STR_TO_DATE
------------
-
-Description
->>>>>>>>>>>
-
-Usage: str_to_date(string, string) is used to extract a TIMESTAMP from the first argument string using the format specified in the second argument string.
-The input argument must have enough information to be parsed as a DATE, TIMESTAMP, or TIME.
-Acceptable string format specifiers are the same as those used in the `DATE_FORMAT`_ function.
-It returns NULL when a statement cannot be parsed due to an invalid pair of arguments, or when 0 is provided for any DATE field. Otherwise, it will return a TIMESTAMP with the parsed values (as well as default values for any field that was not parsed).
-
-Argument type: STRING, STRING
-
-Return type: TIMESTAMP
-
-Example::
-
-    OS> source=people | eval `str_to_date("01,5,2013", "%d,%m,%Y")` = str_to_date("01,5,2013", "%d,%m,%Y") | fields `str_to_date("01,5,2013", "%d,%m,%Y")`
-    fetched rows / total rows = 1/1
-    +--------------------------------------+
-    | str_to_date("01,5,2013", "%d,%m,%Y") |
-    |--------------------------------------|
-    | 2013-05-01 00:00:00                  |
-    +--------------------------------------+
-
-
-SUBDATE
--------
-
-Description
->>>>>>>>>>>
-
-Usage: subdate(date, INTERVAL expr unit) subtracts the interval expr from date; subdate(date, days) subtracts the second argument as an integer number of days from date.
-If the first argument is TIME, today's date is used; if the first argument is DATE, time at midnight is used.
-
-Argument type: DATE/TIMESTAMP/TIME, INTERVAL/LONG
-
-Return type map:
-
-(DATE/TIMESTAMP/TIME, INTERVAL) -> TIMESTAMP
-
-(DATE, LONG) -> DATE
-
-(TIMESTAMP/TIME, LONG) -> TIMESTAMP
-
-Synonyms: `DATE_SUB`_ when invoked with the INTERVAL form of the second argument.
-
-Antonyms: `ADDDATE`_
-
-Example::
-
-    os> source=people | eval `'2008-01-02' - 31d` = SUBDATE(DATE('2008-01-02'), INTERVAL 31 DAY), `'2020-08-26' - 1` = SUBDATE(DATE('2020-08-26'), 1), `ts '2020-08-26 01:01:01' - 1` = SUBDATE(TIMESTAMP('2020-08-26 01:01:01'), 1) | fields `'2008-01-02' - 31d`, `'2020-08-26' - 1`, `ts '2020-08-26 01:01:01' - 1`
-    fetched rows / total rows = 1/1
-    +---------------------+------------------+------------------------------+
-    | '2008-01-02' - 31d  | '2020-08-26' - 1 | ts '2020-08-26 01:01:01' - 1 |
-    |---------------------+------------------+------------------------------|
-    | 2007-12-02 00:00:00 | 2020-08-25       | 2020-08-25 01:01:01          |
-    +---------------------+------------------+------------------------------+
-
-
-SUBTIME
--------
-
-Description
->>>>>>>>>>>
-
-Usage: subtime(expr1, expr2) subtracts expr2 from expr1 and returns the result. If an argument is TIME, today's date is used; if an argument is DATE, time at midnight is used.
-
-Argument type: DATE/TIMESTAMP/TIME, DATE/TIMESTAMP/TIME
-
-Return type map:
-
-(DATE/TIMESTAMP, DATE/TIMESTAMP/TIME) -> TIMESTAMP
-
-(TIME, DATE/TIMESTAMP/TIME) -> TIME
-
-Antonyms: `ADDTIME`_
-
-Example::
-
-    os> source=people | eval `'2008-12-12' - 0` = SUBTIME(DATE('2008-12-12'), DATE('2008-11-15')) | fields `'2008-12-12' - 0`
-    fetched rows / total rows = 1/1
-    +---------------------+
-    | '2008-12-12' - 0    |
-    |---------------------|
-    | 2008-12-12 00:00:00 |
-    +---------------------+
-
-    os> source=people | eval `'23:59:59' - 0` = SUBTIME(TIME('23:59:59'), DATE('2004-01-01')) | fields `'23:59:59' - 0`
-    fetched rows / total rows = 1/1
-    +----------------+
-    | '23:59:59' - 0 |
-    |----------------|
-    | 23:59:59       |
-    +----------------+
-
-    os> source=people | eval `'2004-01-01' - '23:59:59'` = SUBTIME(DATE('2004-01-01'), TIME('23:59:59')) | fields `'2004-01-01' - '23:59:59'`
-    fetched rows / total rows = 1/1
-    +---------------------------+
-    | '2004-01-01' - '23:59:59' |
-    |---------------------------|
-    | 2003-12-31 00:00:01       |
-    +---------------------------+
-
-    os> source=people | eval `'10:20:30' - '00:05:42'` = SUBTIME(TIME('10:20:30'), TIME('00:05:42')) | fields `'10:20:30' - '00:05:42'`
-    fetched rows / total rows = 1/1
-    +-------------------------+
-    | '10:20:30' - '00:05:42' |
-    |-------------------------|
-    | 10:14:48                |
-    +-------------------------+
-
-    os> source=people | eval `'2007-03-01 10:20:30' - '20:40:50'` = SUBTIME(TIMESTAMP('2007-03-01 10:20:30'), TIMESTAMP('2002-03-04 20:40:50')) | fields `'2007-03-01 10:20:30' - '20:40:50'`
-    fetched rows / total rows = 1/1
-    +------------------------------------+
-    | '2007-03-01 10:20:30' - '20:40:50' |
-    |------------------------------------|
-    | 2007-02-28 13:39:40                |
-    +------------------------------------+
-
-
-SYSDATE
--------
-
-Description
->>>>>>>>>>>
-
-Returns the current date and time as a value in 'YYYY-MM-DD hh:mm:ss[.nnnnnn]' format.
-SYSDATE() returns the date and time at which it executes, in UTC. This differs from the behavior of `NOW() <#now>`_, which returns a constant time indicating the time at which the statement began to execute.
-If an argument is given, it specifies a fractional seconds precision from 0 to 6; the return value then includes a fractional seconds part of that many digits.
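-
-Because `NOW() <#now>`_ is fixed at the start of the statement while SYSDATE() is evaluated on each call, comparing the two within one query can reveal time elapsed during execution; the values can differ when execution crosses a second boundary (an illustrative sketch, not a doctest-verified example)::
-
-    > source=people | eval `now` = NOW(), `sysdate` = SYSDATE() | fields `now`, `sysdate`
-    fetched rows / total rows = 1/1
-    +---------------------+---------------------+
-    | now                 | sysdate             |
-    |---------------------+---------------------|
-    | 2022-08-02 15:39:05 | 2022-08-02 15:39:05 |
-    +---------------------+---------------------+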
- -Optional argument type: INTEGER - -Return type: TIMESTAMP - -Specification: SYSDATE([INTEGER]) -> TIMESTAMP - -Example:: - - > source=people | eval `value_1` = SYSDATE(), `value_2` = SYSDATE(6) | fields `value_1`, `value_2` - fetched rows / total rows = 1/1 - +---------------------+----------------------------+ - | value_1 | value_2 | - |---------------------+----------------------------| - | 2022-08-02 15:39:05 | 2022-08-02 15:39:05.123456 | - +---------------------+----------------------------+ - - -TIME ----- - -Description ->>>>>>>>>>> - -Usage: time(expr) constructs a time type with the input string expr as a time. If the argument is of date/time/timestamp, it extracts the time value part from the expression. - -Argument type: STRING/DATE/TIME/TIMESTAMP - -Return type: TIME - -Example:: - - os> source=people | eval `TIME('13:49:00')` = TIME('13:49:00') | fields `TIME('13:49:00')` - fetched rows / total rows = 1/1 - +------------------+ - | TIME('13:49:00') | - |------------------| - | 13:49:00 | - +------------------+ - - os> source=people | eval `TIME('13:49')` = TIME('13:49') | fields `TIME('13:49')` - fetched rows / total rows = 1/1 - +---------------+ - | TIME('13:49') | - |---------------| - | 13:49:00 | - +---------------+ - - os> source=people | eval `TIME('2020-08-26 13:49:00')` = TIME('2020-08-26 13:49:00') | fields `TIME('2020-08-26 13:49:00')` - fetched rows / total rows = 1/1 - +-----------------------------+ - | TIME('2020-08-26 13:49:00') | - |-----------------------------| - | 13:49:00 | - +-----------------------------+ - - os> source=people | eval `TIME('2020-08-26 13:49')` = TIME('2020-08-26 13:49') | fields `TIME('2020-08-26 13:49')` - fetched rows / total rows = 1/1 - +--------------------------+ - | TIME('2020-08-26 13:49') | - |--------------------------| - | 13:49:00 | - +--------------------------+ - - -TIME_FORMAT ------------ - -Description ->>>>>>>>>>> - -Usage: time_format(time, format) formats the time argument using the specifiers in the format argument. -This supports a subset of the time format specifiers available for the `date_format`_ function. -Using date format specifiers supported by `date_format`_ will return 0 or null. -Acceptable format specifiers are listed in the table below. -If an argument of type DATE is passed in, it is treated as a TIMESTAMP at midnight (i.e., 00:00:00). - -.. list-table:: The following table describes the available specifier arguments. 
-   :widths: 20 80
-   :header-rows: 1
-
-   * - Specifier
-     - Description
-   * - %f
-     - Microseconds (000000..999999)
-   * - %H
-     - Hour (00..23)
-   * - %h
-     - Hour (01..12)
-   * - %I
-     - Hour (01..12)
-   * - %i
-     - Minutes, numeric (00..59)
-   * - %p
-     - AM or PM
-   * - %r
-     - Time, 12-hour (hh:mm:ss followed by AM or PM)
-   * - %S
-     - Seconds (00..59)
-   * - %s
-     - Seconds (00..59)
-   * - %T
-     - Time, 24-hour (hh:mm:ss)
-
-
-Argument type: STRING/DATE/TIME/TIMESTAMP, STRING
-
-Return type: STRING
-
-Example::
-
-    os> source=people | eval `TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T')` = TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T') | fields `TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T')`
-    fetched rows / total rows = 1/1
-    +-----------------------------------------------------------------------------+
-    | TIME_FORMAT('1998-01-31 13:14:15.012345', '%f %H %h %I %i %p %r %S %s %T') |
-    |-----------------------------------------------------------------------------|
-    | 012345 13 01 01 14 PM 01:14:15 PM 15 15 13:14:15                            |
-    +-----------------------------------------------------------------------------+
-
-
-TIME_TO_SEC
------------
-
-Description
->>>>>>>>>>>
-
-Usage: time_to_sec(time) returns the time argument, converted to seconds.
-
-Argument type: STRING/TIME/TIMESTAMP
-
-Return type: LONG
-
-Example::
-
-    os> source=people | eval `TIME_TO_SEC(TIME('22:23:00'))` = TIME_TO_SEC(TIME('22:23:00')) | fields `TIME_TO_SEC(TIME('22:23:00'))`
-    fetched rows / total rows = 1/1
-    +-------------------------------+
-    | TIME_TO_SEC(TIME('22:23:00')) |
-    |-------------------------------|
-    | 80580                         |
-    +-------------------------------+
-
-
-TIMEDIFF
---------
-
-Description
->>>>>>>>>>>
-
-Usage: returns the difference between two time expressions as a time.
-
-Argument type: TIME, TIME
-
-Return type: TIME
-
-Example::
-
-    os> source=people | eval `TIMEDIFF('23:59:59', '13:00:00')` = TIMEDIFF('23:59:59', '13:00:00') | fields `TIMEDIFF('23:59:59', '13:00:00')`
-    fetched rows / total rows = 1/1
-    +----------------------------------+
-    | TIMEDIFF('23:59:59', '13:00:00') |
-    |----------------------------------|
-    | 10:59:59                         |
-    +----------------------------------+
-
-
-TIMESTAMP
----------
-
-Description
->>>>>>>>>>>
-
-Usage: timestamp(expr) constructs a timestamp type with the input string `expr` as a timestamp. If the argument is not a string, it casts `expr` to timestamp type with the default timezone UTC. If the argument is a time, it applies today's date before the cast.
-With two arguments, `timestamp(expr1, expr2)` adds the time expression `expr2` to the date or timestamp expression `expr1` and returns the result as a timestamp value.
- -Argument type: STRING/DATE/TIME/TIMESTAMP - -Return type map: - -(STRING/DATE/TIME/TIMESTAMP) -> TIMESTAMP - -(STRING/DATE/TIME/TIMESTAMP, STRING/DATE/TIME/TIMESTAMP) -> TIMESTAMP - -Example:: - - os> source=people | eval `TIMESTAMP('2020-08-26 13:49:00')` = TIMESTAMP('2020-08-26 13:49:00'), `TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42'))` = TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42')) | fields `TIMESTAMP('2020-08-26 13:49:00')`, `TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42'))` - fetched rows / total rows = 1/1 - +----------------------------------+----------------------------------------------------+ - | TIMESTAMP('2020-08-26 13:49:00') | TIMESTAMP('2020-08-26 13:49:00', TIME('12:15:42')) | - |----------------------------------+----------------------------------------------------| - | 2020-08-26 13:49:00 | 2020-08-27 02:04:42 | - +----------------------------------+----------------------------------------------------+ - - -TIMESTAMPADD ------------- - -Description ->>>>>>>>>>> - -Usage: Returns a TIMESTAMP value based on a passed in DATE/TIME/TIMESTAMP/STRING argument and an INTERVAL and INTEGER argument which determine the amount of time to be added. -If the third argument is a STRING, it must be formatted as a valid TIMESTAMP. If only a TIME is provided, a TIMESTAMP is still returned with the DATE portion filled in using the current date. -If the third argument is a DATE, it will be automatically converted to a TIMESTAMP. - -Argument type: INTERVAL, INTEGER, DATE/TIME/TIMESTAMP/STRING - -INTERVAL must be one of the following tokens: [MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, YEAR] - -Examples:: - - os> source=people | eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` - fetched rows / total rows = 1/1 - +----------------------------------------------+--------------------------------------------------+ - | TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | - |----------------------------------------------+--------------------------------------------------| - | 2000-01-18 00:00:00 | 1999-10-01 00:00:00 | - +----------------------------------------------+--------------------------------------------------+ - - -TIMESTAMPDIFF -------------- - -Description ->>>>>>>>>>> - -Usage: TIMESTAMPDIFF(interval, start, end) returns the difference between the start and end date/times in interval units. -If a TIME is provided as an argument, it will be converted to a TIMESTAMP with the DATE portion filled in using the current date. -Arguments will be automatically converted to a TIME/TIMESTAMP when appropriate. -Any argument that is a STRING must be formatted as a valid TIMESTAMP. 
- -Argument type: INTERVAL, DATE/TIME/TIMESTAMP/STRING, DATE/TIME/TIMESTAMP/STRING - -INTERVAL must be one of the following tokens: [MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, YEAR] - -Examples:: - - os> source=people | eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00'))` = TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00'))` - fetched rows / total rows = 1/1 - +-------------------------------------------------------------------+-----------------------------------------------------------+ - | TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | TIMESTAMPDIFF(SECOND, time('00:00:23'), time('00:00:00')) | - |-------------------------------------------------------------------+-----------------------------------------------------------| - | 4 | -23 | - +-------------------------------------------------------------------+-----------------------------------------------------------+ - - -TO_DAYS -------- - -Description ->>>>>>>>>>> - -Usage: to_days(date) returns the day number (the number of days since year 0) of the given date. Returns NULL if date is invalid. - -Argument type: STRING/DATE/TIMESTAMP - -Return type: LONG - -Example:: - - os> source=people | eval `TO_DAYS(DATE('2008-10-07'))` = TO_DAYS(DATE('2008-10-07')) | fields `TO_DAYS(DATE('2008-10-07'))` - fetched rows / total rows = 1/1 - +-----------------------------+ - | TO_DAYS(DATE('2008-10-07')) | - |-----------------------------| - | 733687 | - +-----------------------------+ - - -TO_SECONDS ----------- - -Description ->>>>>>>>>>> - -Usage: to_seconds(date) returns the number of seconds since the year 0 of the given value. Returns NULL if value is invalid. -An argument of a LONG type can be used. It must be formatted as YMMDD, YYMMDD, YYYMMDD or YYYYMMDD. Note that a LONG type argument cannot have leading 0s as it will be parsed using an octal numbering system. - -Argument type: STRING/LONG/DATE/TIME/TIMESTAMP - -Return type: LONG - -Example:: - - os> source=people | eval `TO_SECONDS(DATE('2008-10-07'))` = TO_SECONDS(DATE('2008-10-07')) | eval `TO_SECONDS(950228)` = TO_SECONDS(950228) | fields `TO_SECONDS(DATE('2008-10-07'))`, `TO_SECONDS(950228)` - fetched rows / total rows = 1/1 - +--------------------------------+--------------------+ - | TO_SECONDS(DATE('2008-10-07')) | TO_SECONDS(950228) | - |--------------------------------+--------------------| - | 63390556800 | 62961148800 | - +--------------------------------+--------------------+ - - -UNIX_TIMESTAMP --------------- - -Description ->>>>>>>>>>> - -Usage: Converts given argument to Unix time (seconds since Epoch - very beginning of year 1970). If no argument given, it returns the current Unix time. -The date argument may be a DATE, or TIMESTAMP string, or a number in YYMMDD, YYMMDDhhmmss, YYYYMMDD, or YYYYMMDDhhmmss format. If the argument includes a time part, it may optionally include a fractional seconds part. -If argument is in invalid format or outside of range 1970-01-01 00:00:00 - 3001-01-18 23:59:59.999999 (0 to 32536771199.999999 epoch time), function returns NULL. -You can use `FROM_UNIXTIME`_ to do reverse conversion. 
- -Argument type: /DOUBLE/DATE/TIMESTAMP - -Return type: DOUBLE - -Example:: - - os> source=people | eval `UNIX_TIMESTAMP(double)` = UNIX_TIMESTAMP(20771122143845), `UNIX_TIMESTAMP(timestamp)` = UNIX_TIMESTAMP(TIMESTAMP('1996-11-15 17:05:42')) | fields `UNIX_TIMESTAMP(double)`, `UNIX_TIMESTAMP(timestamp)` - fetched rows / total rows = 1/1 - +------------------------+---------------------------+ - | UNIX_TIMESTAMP(double) | UNIX_TIMESTAMP(timestamp) | - |------------------------+---------------------------| - | 3404817525.0 | 848077542.0 | - +------------------------+---------------------------+ - - -UTC_DATE --------- - -Description ->>>>>>>>>>> - -Returns the current UTC date as a value in 'YYYY-MM-DD'. - -Return type: DATE - -Specification: UTC_DATE() -> DATE - -Example:: - - > source=people | eval `UTC_DATE()` = UTC_DATE() | fields `UTC_DATE()` - fetched rows / total rows = 1/1 - +------------+ - | UTC_DATE() | - |------------| - | 2022-10-03 | - +------------+ - - -UTC_TIME --------- - -Description ->>>>>>>>>>> - -Returns the current UTC time as a value in 'hh:mm:ss'. - -Return type: TIME - -Specification: UTC_TIME() -> TIME - -Example:: - - > source=people | eval `UTC_TIME()` = UTC_TIME() | fields `UTC_TIME()` - fetched rows / total rows = 1/1 - +------------+ - | UTC_TIME() | - |------------| - | 17:54:27 | - +------------+ - - -UTC_TIMESTAMP -------------- - -Description ->>>>>>>>>>> - -Returns the current UTC timestamp as a value in 'YYYY-MM-DD hh:mm:ss'. - -Return type: TIMESTAMP - -Specification: UTC_TIMESTAMP() -> TIMESTAMP - -Example:: - - > source=people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()` - fetched rows / total rows = 1/1 - +---------------------+ - | UTC_TIMESTAMP() | - |---------------------| - | 2022-10-03 17:54:28 | - +---------------------+ - - -WEEK ----- - -Description ->>>>>>>>>>> - -Usage: week(date[, mode]) returns the week number for date. If the mode argument is omitted, the default mode 0 is used. - -.. list-table:: The following table describes how the mode argument works. - :widths: 25 50 25 75 - :header-rows: 1 - - * - Mode - - First day of week - - Range - - Week 1 is the first week ... - * - 0 - - Sunday - - 0-53 - - with a Sunday in this year - * - 1 - - Monday - - 0-53 - - with 4 or more days this year - * - 2 - - Sunday - - 1-53 - - with a Sunday in this year - * - 3 - - Monday - - 1-53 - - with 4 or more days this year - * - 4 - - Sunday - - 0-53 - - with 4 or more days this year - * - 5 - - Monday - - 0-53 - - with a Monday in this year - * - 6 - - Sunday - - 1-53 - - with 4 or more days this year - * - 7 - - Monday - - 1-53 - - with a Monday in this year - -Argument type: DATE/TIMESTAMP/STRING - -Return type: INTEGER - -Synonyms: `WEEK_OF_YEAR`_ - -Example:: - - os> source=people | eval `WEEK(DATE('2008-02-20'))` = WEEK(DATE('2008-02-20')), `WEEK(DATE('2008-02-20'), 1)` = WEEK(DATE('2008-02-20'), 1) | fields `WEEK(DATE('2008-02-20'))`, `WEEK(DATE('2008-02-20'), 1)` - fetched rows / total rows = 1/1 - +--------------------------+-----------------------------+ - | WEEK(DATE('2008-02-20')) | WEEK(DATE('2008-02-20'), 1) | - |--------------------------+-----------------------------| - | 7 | 8 | - +--------------------------+-----------------------------+ - - -WEEKDAY -------- - -Description ->>>>>>>>>>> - -Usage: weekday(date) returns the weekday index for date (0 = Monday, 1 = Tuesday, ..., 6 = Sunday). - -It is similar to the `dayofweek`_ function, but returns different indexes for each day. 
- -Argument type: STRING/DATE/TIME/TIMESTAMP - -Return type: INTEGER - -Example:: - - os> source=people | eval `weekday(DATE('2020-08-26'))` = weekday(DATE('2020-08-26')) | eval `weekday(DATE('2020-08-27'))` = weekday(DATE('2020-08-27')) | fields `weekday(DATE('2020-08-26'))`, `weekday(DATE('2020-08-27'))` - fetched rows / total rows = 1/1 - +-----------------------------+-----------------------------+ - | weekday(DATE('2020-08-26')) | weekday(DATE('2020-08-27')) | - |-----------------------------+-----------------------------| - | 2 | 3 | - +-----------------------------+-----------------------------+ - - -WEEK_OF_YEAR ------------- - -Description ->>>>>>>>>>> - -Usage: week_of_year(date[, mode]) returns the week number for date. If the mode argument is omitted, the default mode 0 is used. - -.. list-table:: The following table describes how the mode argument works. - :widths: 25 50 25 75 - :header-rows: 1 - - * - Mode - - First day of week - - Range - - Week 1 is the first week ... - * - 0 - - Sunday - - 0-53 - - with a Sunday in this year - * - 1 - - Monday - - 0-53 - - with 4 or more days this year - * - 2 - - Sunday - - 1-53 - - with a Sunday in this year - * - 3 - - Monday - - 1-53 - - with 4 or more days this year - * - 4 - - Sunday - - 0-53 - - with 4 or more days this year - * - 5 - - Monday - - 0-53 - - with a Monday in this year - * - 6 - - Sunday - - 1-53 - - with 4 or more days this year - * - 7 - - Monday - - 1-53 - - with a Monday in this year - -Argument type: DATE/TIMESTAMP/STRING - -Return type: INTEGER - -Synonyms: `WEEK`_ - -Example:: - - os> source=people | eval `WEEK_OF_YEAR(DATE('2008-02-20'))` = WEEK(DATE('2008-02-20')), `WEEK_OF_YEAR(DATE('2008-02-20'), 1)` = WEEK_OF_YEAR(DATE('2008-02-20'), 1) | fields `WEEK_OF_YEAR(DATE('2008-02-20'))`, `WEEK_OF_YEAR(DATE('2008-02-20'), 1)` - fetched rows / total rows = 1/1 - +----------------------------------+-------------------------------------+ - | WEEK_OF_YEAR(DATE('2008-02-20')) | WEEK_OF_YEAR(DATE('2008-02-20'), 1) | - |----------------------------------+-------------------------------------| - | 7 | 8 | - +----------------------------------+-------------------------------------+ - - -YEAR ----- - -Description ->>>>>>>>>>> - -Usage: year(date) returns the year for date, in the range 1000 to 9999, or 0 for the “zero” date. - -Argument type: STRING/DATE/TIMESTAMP - -Return type: INTEGER - -Example:: - - os> source=people | eval `YEAR(DATE('2020-08-26'))` = YEAR(DATE('2020-08-26')) | fields `YEAR(DATE('2020-08-26'))` - fetched rows / total rows = 1/1 - +--------------------------+ - | YEAR(DATE('2020-08-26')) | - |--------------------------| - | 2020 | - +--------------------------+ - - -YEARWEEK --------- - -Description ->>>>>>>>>>> - -Usage: yearweek(date[, mode]) returns the year and week for date as an integer. It accepts and optional mode arguments aligned with those available for the `WEEK`_ function. 
- -Argument type: STRING/DATE/TIME/TIMESTAMP - -Return type: INTEGER - -Example:: - - os> source=people | eval `YEARWEEK('2020-08-26')` = YEARWEEK('2020-08-26') | eval `YEARWEEK('2019-01-05', 1)` = YEARWEEK('2019-01-05', 1) | fields `YEARWEEK('2020-08-26')`, `YEARWEEK('2019-01-05', 1)` - fetched rows / total rows = 1/1 - +------------------------+---------------------------+ - | YEARWEEK('2020-08-26') | YEARWEEK('2019-01-05', 1) | - |------------------------+---------------------------| - | 202034 | 201901 | - +------------------------+---------------------------+ - diff --git a/docs/user/ppl/functions/expressions.md b/docs/user/ppl/functions/expressions.md new file mode 100644 index 00000000000..999531cabbe --- /dev/null +++ b/docs/user/ppl/functions/expressions.md @@ -0,0 +1,185 @@ +# Expressions + +## Introduction + +Expressions, particularly value expressions, are those which return a scalar value. Expressions have different types and forms. For example, there are literal values as atom expressions, and arithmetic, predicate and function expressions built on top of them. Expressions can also be used in different clauses, such as arithmetic expressions in the `Filter` and `Stats` commands. +## Arithmetic Operators + +### Description + +#### Operators + +Arithmetic expression is an expression formed by numeric literals and binary arithmetic operators as follows: +1. `+`: Add. +2. `-`: Subtract. +3. `*`: Multiply. +4. `/`: Divide. Integer operands follow the legacy truncating result when [plugins.ppl.syntax.legacy.preferred](../admin/settings.md) is `true` (default). When the setting is `false` the operands are promoted to floating point, preserving the fractional part. Division by zero still returns `NULL`. +5. `%`: Modulo. This can be used with integers only, with the remainder of the division as the result. + +#### Precedence + +Parentheses can be used to control the precedence of arithmetic operators. Otherwise, operators of higher precedence are applied first. +#### Type Conversion + +Implicit type conversion is performed when looking up operator signatures. For example, an integer `+` a real number matches the signature `+(double,double)`, which results in a real number. This rule also applies to the function calls discussed below. +### Examples + +Here is an example of different types of arithmetic expressions + +```ppl +source=accounts +| where age > (25 + 5) +| fields age +``` + +Expected output: + +```text +fetched rows / total rows = 3/3 ++-----+ +| age | +|-----| +| 32 | +| 36 | +| 33 | ++-----+ ``` + +## Predicate Operators + +### Description + +A predicate operator is an expression that evaluates to true or false. MISSING and NULL value comparison follows these rules: a MISSING value is only equal to a MISSING value and is less than all other values; a NULL value is equal to a NULL value, greater than a MISSING value, but less than all other values. +#### Operators + +| name | description | | --- | --- | | > | Greater than operator | | >= | Greater than or equal operator | | < | Less than operator | | != | Not equal operator | | <= | Less than or equal operator | | = | Equal operator | | == | Equal operator (alternative syntax) | | LIKE | Simple pattern matching | | IN | Value list membership test | | AND | AND operator | | OR | OR operator | | XOR | XOR operator | | NOT | Logical negation | + +It is possible to compare datetimes. When comparing different datetime types, for example `DATE` and `TIME`, both are converted to `DATETIME`. +The following rule is applied during conversion: a `TIME` value is applied to today's date; a `DATE` is interpreted as of midnight.
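+For instance, in the sketch below (illustrative only, not executed as a doctest) the `TIME` literal is anchored to today's date, so it compares greater than any past `DATE`: + +```ppl ignore +source=people +| eval time_gt_date = TIME('10:00:00') > DATE('2020-01-01') +| fields time_gt_date +| head 1 +``` + +```text +fetched rows / total rows = 1/1 ++--------------+ +| time_gt_date | +|--------------| +| True | ++--------------+ +```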
+### Examples + +#### Basic Predicate Operator + +Here is an example for comparison operators + +```ppl +source=accounts +| where age > 33 +| fields age +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----+ +| age | +|-----| +| 36 | ++-----+ +``` + +The `==` operator can be used as an alternative to `=` for equality comparisons + +```ppl +source=accounts +| where age == 32 +| fields age +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----+ +| age | +|-----| +| 32 | ++-----+ +``` + +Note: Both `=` and `==` perform the same equality comparison. You can use either based on your preference. +#### IN + +The IN operator tests whether a field's value is in a list of literals + +```ppl +source=accounts +| where age in (32, 33) +| fields age +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----+ +| age | +|-----| +| 32 | +| 33 | ++-----+ +``` + +#### OR + +OR operator + +```ppl +source=accounts +| where age = 32 OR age = 33 +| fields age +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----+ +| age | +|-----| +| 32 | +| 33 | ++-----+ +``` + +#### NOT + +NOT operator + +```ppl +source=accounts +| where not age in (32, 33) +| fields age +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++-----+ +| age | +|-----| +| 36 | +| 28 | ++-----+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/expressions.rst b/docs/user/ppl/functions/expressions.rst deleted file mode 100644 index 2b30c739a45..00000000000 --- a/docs/user/ppl/functions/expressions.rst +++ /dev/null @@ -1,177 +0,0 @@ -=========== -Expressions -=========== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 3 - - -Introduction -============ - -Expressions, particularly value expressions, are those which return a scalar value. Expressions have different types and forms. For example, there are literal values as atom expression and arithmetic, predicate and function expression built on top of them. And also expressions can be used in different clauses, such as using arithmetic expression in ``Filter``, ``Stats`` command. - -Arithmetic Operators -==================== - -Description ----------- - -Operators -````````` - -Arithmetic expression is an expression formed by numeric literals and binary arithmetic operators as follows: - -1. ``+``: Add. -2. ``-``: Subtract. -3. ``*``: Multiply. -4. ``/``: Divide. Integer operands follow the legacy truncating result when - `plugins.ppl.syntax.legacy.preferred <../admin/settings.rst>`_ is ``true`` (default). When the - setting is ``false`` the operands are promoted to floating point, preserving - the fractional part. Division by zero still returns ``NULL``. -5. ``%``: Modulo. This can be used with integers only with remainder of the division as result. - -Precedence -`````````` - -Parentheses can be used to control the precedence of arithmetic operators. Otherwise, operators of higher precedence is performed first. - -Type Conversion -``````````````` - -Implicit type conversion is performed when looking up operator signature. For example, an integer ``+`` a real number matches signature ``+(double,double)`` which results in a real number. This rule also applies to function call discussed below.
- -Examples --------- - -Here is an example for different type of arithmetic expressions:: - - os> source=accounts | where age > (25 + 5) | fields age ; - fetched rows / total rows = 3/3 - +-----+ - | age | - |-----| - | 32 | - | 36 | - | 33 | - +-----+ - -Predicate Operators -=================== - -Description ------------ - -Predicate operator is an expression that evaluated to be ture. The MISSING and NULL value comparison has following the rule. MISSING value only equal to MISSING value and less than all the other values. NULL value equals to NULL value, large than MISSING value, but less than all the other values. - -Operators -````````` - -+----------------+----------------------------------------+ -| name | description | -+----------------+----------------------------------------+ -| > | Greater than operator | -+----------------+----------------------------------------+ -| >= | Greater than or equal operator | -+----------------+----------------------------------------+ -| < | Less than operator | -+----------------+----------------------------------------+ -| != | Not equal operator | -+----------------+----------------------------------------+ -| <= | Less than or equal operator | -+----------------+----------------------------------------+ -| = | Equal operator | -+----------------+----------------------------------------+ -| == | Equal operator (alternative syntax) | -+----------------+----------------------------------------+ -| LIKE | Simple Pattern matching | -+----------------+----------------------------------------+ -| IN | NULL value test | -+----------------+----------------------------------------+ -| AND | AND operator | -+----------------+----------------------------------------+ -| OR | OR operator | -+----------------+----------------------------------------+ -| XOR | XOR operator | -+----------------+----------------------------------------+ -| NOT | NOT NULL value test | -+----------------+----------------------------------------+ - -It is possible to compare datetimes. When comparing different datetime types, for example `DATE` and `TIME`, both converted to `DATETIME`. -The following rule is applied on coversion: a `TIME` applied to today's date; `DATE` is interpreted at midnight. - -Examples --------- - -Basic Predicate Operator -```````````````````````` - -Here is an example for comparison operators:: - - os> source=accounts | where age > 33 | fields age ; - fetched rows / total rows = 1/1 - +-----+ - | age | - |-----| - | 36 | - +-----+ - -The ``==`` operator can be used as an alternative to ``=`` for equality comparisons:: - - os> source=accounts | where age == 32 | fields age ; - fetched rows / total rows = 1/1 - +-----+ - | age | - |-----| - | 32 | - +-----+ - -Note: Both ``=`` and ``==`` perform the same equality comparison. You can use either based on your preference. 
- - -IN -`` - -IN operator test field in value lists:: - - os> source=accounts | where age in (32, 33) | fields age ; - fetched rows / total rows = 2/2 - +-----+ - | age | - |-----| - | 32 | - | 33 | - +-----+ - - -OR -`` - -OR operator :: - - os> source=accounts | where age = 32 OR age = 33 | fields age ; - fetched rows / total rows = 2/2 - +-----+ - | age | - |-----| - | 32 | - | 33 | - +-----+ - - -NOT -``` - -NOT operator :: - - os> source=accounts | where not age in (32, 33) | fields age ; - fetched rows / total rows = 2/2 - +-----+ - | age | - |-----| - | 36 | - | 28 | - +-----+ diff --git a/docs/user/ppl/functions/ip.md b/docs/user/ppl/functions/ip.md new file mode 100644 index 00000000000..673a0a8d250 --- /dev/null +++ b/docs/user/ppl/functions/ip.md @@ -0,0 +1,61 @@ +# IP Address Functions + +## CIDRMATCH + +### Description + +Usage: `cidrmatch(ip, cidr)` checks if `ip` is within the specified `cidr` range. +Argument type: STRING/IP, STRING +Return type: BOOLEAN +Example + +```ppl +source=weblogs +| where cidrmatch(host, '1.2.3.0/24') +| fields host, url +``` + +Expected output: + +```text +fetched rows / total rows = 2/2 ++---------+--------------------+ +| host | url | +|---------+--------------------| +| 1.2.3.4 | /history/voyager1/ | +| 1.2.3.5 | /history/voyager2/ | ++---------+--------------------+ +``` + +Note: + - `ip` can be an IPv4 or IPv6 address + - `cidr` can be an IPv4 or IPv6 block + - `ip` and `cidr` must both be valid and non-missing/non-null + +## GEOIP + +### Description + +Usage: `geoip(dataSourceName, ipAddress[, options])` to lookup location information from given IP addresses via OpenSearch GeoSpatial plugin API. +Argument type: STRING, STRING/IP, STRING +Return type: OBJECT +Example: + +```ppl ignore +source=weblogs +| eval LookupResult = geoip("dataSourceName", "50.68.18.229", "country_iso_code,city_name") +``` + +```text +fetched rows / total rows = 1/1 ++-------------------------------------------------------------+ +| LookupResult | +|-------------------------------------------------------------| +| {'city_name': 'Vancouver', 'country_iso_code': 'CA'} | ++-------------------------------------------------------------+ +``` + +Note: + - `dataSourceName` must be an established dataSource on OpenSearch GeoSpatial plugin, detail of configuration can be found: https://opensearch.org/docs/latest/ingest-pipelines/processors/ip2geo/ + - `ip` can be an IPv4 or an IPv6 address + - `options` is an optional String of comma separated fields to output: the selection of fields is subject to dataSourceProvider's schema. For example, the list of fields in the provided `geolite2-city` dataset includes: "country_iso_code", "country_name", "continent_name", "region_iso_code", "region_name", "city_name", "time_zone", "location" \ No newline at end of file diff --git a/docs/user/ppl/functions/ip.rst b/docs/user/ppl/functions/ip.rst deleted file mode 100644 index ec853c27093..00000000000 --- a/docs/user/ppl/functions/ip.rst +++ /dev/null @@ -1,69 +0,0 @@ -==================== -IP Address Functions -==================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -CIDRMATCH ---------- - -Description ->>>>>>>>>>> - -Usage: `cidrmatch(ip, cidr)` checks if `ip` is within the specified `cidr` range. 
- -Argument type: STRING/IP, STRING - -Return type: BOOLEAN - -Example:: - - > source=weblogs | where cidrmatch(host, '1.2.3.0/24') | fields host, url - fetched rows / total rows = 2/2 - +---------+--------------------+ - | host | url | - |---------|--------------------| - | 1.2.3.4 | /history/voyager1/ | - | 1.2.3.5 | /history/voyager2/ | - +---------+--------------------+ - -Note: - - `ip` can be an IPv4 or IPv6 address - - `cidr` can be an IPv4 or IPv6 block - - `ip` and `cidr` must both be valid and non-missing/non-null - - -GEOIP ---------- - -Description ->>>>>>>>>>> - -Usage: `geoip(dataSourceName, ipAddress[, options])` to lookup location information from given IP addresses via OpenSearch GeoSpatial plugin API. - -Argument type: STRING, STRING/IP, STRING - -Return type: OBJECT - -.. The execution of below example is being excluded, as this requires a standalone Geo-Spatial dataSource setup, which is not yet supported by docTest. - -Example: - - > source=weblogs | eval LookupResult = geoip("dataSourceName", "50.68.18.229", "country_iso_code,city_name") - fetched rows / total rows = 1/1 - +-------------------------------------------------------------+ - | LookupResult | - |-------------------------------------------------------------| - | {'city_name': 'Vancouver', 'country_iso_code': 'CA'} | - +-------------------------------------------------------------+ - - -Note: - - `dataSourceName` must be an established dataSource on OpenSearch GeoSpatial plugin, detail of configuration can be found: https://opensearch.org/docs/latest/ingest-pipelines/processors/ip2geo/ - - `ip` can be an IPv4 or an IPv6 address - - `options` is an optional String of comma separated fields to output: the selection of fields is subject to dataSourceProvider's schema. For example, the list of fields in the provided `geolite2-city` dataset includes: "country_iso_code", "country_name", "continent_name", "region_iso_code", "region_name", "city_name", "time_zone", "location" - diff --git a/docs/user/ppl/functions/json.md b/docs/user/ppl/functions/json.md new file mode 100644 index 00000000000..e7cec247a81 --- /dev/null +++ b/docs/user/ppl/functions/json.md @@ -0,0 +1,502 @@ +# JSON Functions + +## JSON Path + +### Description + +All JSON paths used in JSON functions follow the format `{}.{}...`. +Each `` represents a field name. The `{}` part is optional and is only applicable when the corresponding key refers to an array. +For example + +```bash +a{2}.b{0} + +``` + +This refers to the element at index 0 of the `b` array, which is nested inside the element at index 2 of the `a` array. +Notes: +1. The `{}` notation applies **only when** the associated key points to an array. +2. `{}` (without a specific index) is interpreted as a **wildcard**, equivalent to `{*}`, meaning "all elements" in the array at that level. + +## JSON + +### Description + +Usage: `json(value)` Evaluates whether a string can be parsed as a json-encoded string. Returns the value if valid, null otherwise. 
+Argument type: STRING +Return type: STRING +Example + +```ppl +source=json_test +| where json_valid(json_string) +| eval json=json(json_string) +| fields test_name, json_string, json +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------------------+---------------------------------+---------------------------------+ +| test_name | json_string | json | +|--------------------+---------------------------------+---------------------------------| +| json nested object | {"a":"1","b":{"c":"2","d":"3"}} | {"a":"1","b":{"c":"2","d":"3"}} | +| json object | {"a":"1","b":"2"} | {"a":"1","b":"2"} | +| json array | [1, 2, 3, 4] | [1, 2, 3, 4] | +| json scalar string | "abc" | "abc" | ++--------------------+---------------------------------+---------------------------------+ +``` + +## JSON_VALID + +### Description + +Version: 3.1.0 +Limitation: Only works when plugins.calcite.enabled=true +Usage: `json_valid(value)` Evaluates whether a string uses valid JSON syntax. Returns TRUE if valid, FALSE if invalid. NULL input returns NULL. +Argument type: STRING +Return type: BOOLEAN +Example + +```ppl +source=people +| eval is_valid_json = json_valid('[1,2,3,4]'), is_invalid_json = json_valid('{invalid}') +| fields is_valid_json, is_invalid_json +| head 1 +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------+-----------------+ +| is_valid_json | is_invalid_json | +|---------------+-----------------| +| True | False | ++---------------+-----------------+ +``` + +## JSON_OBJECT + +### Description + +Usage: `json_object(key1, value1, key2, value2...)` creates a JSON object string with key-value pairs. The keys must be strings. +Argument type: key1: STRING, value1: ANY, key2: STRING, value2: ANY ... +Return type: STRING +Example + +```ppl +source=json_test +| eval test_json = json_object('key', 123.45) +| head 1 +| fields test_json +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| test_json | +|----------------| +| {"key":123.45} | ++----------------+ +``` + +## JSON_ARRAY + +### Description + +Usage: `json_array(element1, element2, ...)` creates a JSON array string from the given elements. +Argument type: element1: ANY, element2: ANY ... +Return type: STRING +Example + +```ppl +source=json_test +| eval test_json_array = json_array('key', 123.45) +| head 1 +| fields test_json_array +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------+ +| test_json_array | +|-----------------| +| ["key",123.45] | ++-----------------+ +``` + +## JSON_ARRAY_LENGTH + +### Description + +Usage: `json_array_length(value)` parses the string as a JSON array and returns its size; null is returned for any other valid JSON string, null, or invalid JSON. +Argument type: value: A JSON STRING +Return type: INTEGER +Example + +```ppl +source=json_test +| eval array_length = json_array_length("[1,2,3]") +| head 1 +| fields array_length +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| array_length | +|--------------| +| 3 | ++--------------+ +``` + +```ppl +source=json_test +| eval array_length = json_array_length("{\"1\": 2}") +| head 1 +| fields array_length +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| array_length | +|--------------| +| null | ++--------------+ +```
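+Note that the size counts only top-level elements; nested arrays are not flattened. A minimal sketch of this behavior (illustrative only, not executed as a doctest, assuming the same `json_test` index): + +```ppl ignore +source=json_test +| eval array_length = json_array_length('[[1,2],[3],[4,5,6]]') +| head 1 +| fields array_length +``` + +```text +fetched rows / total rows = 1/1 ++--------------+ +| array_length | +|--------------| +| 3 | ++--------------+ +```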
+## JSON_EXTRACT + +### Description + +Usage: `json_extract(json_string, path1, path2, ...)` extracts values using the specified JSON paths. If only one path is provided, it returns a single value. If multiple paths are provided, it returns a JSON array in the order of the paths. If a path cannot find a value, null is returned as the result for that path. The path uses "{}" to represent an array index; "{}" alone means "{*}". +Argument type: json_string: STRING, path1: STRING, path2: STRING ... +Return type: STRING +Example + +```ppl +source=json_test +| eval extract = json_extract('{"a": [{"b": 1}, {"b": 2}]}', 'a{}.b') +| head 1 +| fields extract +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| extract | +|---------| +| [1,2] | ++---------+ +``` + +```ppl +source=json_test +| eval extract = json_extract('{"a": [{"b": 1}, {"b": 2}]}', 'a{}.b', 'a{}') +| head 1 +| fields extract +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------+ +| extract | +|---------------------------| +| [[1,2],[{"b":1},{"b":2}]] | ++---------------------------+ +``` + +## JSON_DELETE + +### Description + +Usage: `json_delete(json_string, path1, path2, ...)` deletes values at the specified JSON paths and returns the JSON string after deletion. If a path cannot find a value, nothing is done for that path. +Argument type: json_string: STRING, path1: STRING, path2: STRING ... +Return type: STRING +Example + +```ppl +source=json_test +| eval delete = json_delete('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b') +| head 1 +| fields delete +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+ +| delete | +|--------------------| +| {"a":[{},{"b":2}]} | ++--------------------+ +``` + +```ppl +source=json_test +| eval delete = json_delete('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 'a{1}.b') +| head 1 +| fields delete +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------+ +| delete | +|---------------| +| {"a":[{},{}]} | ++---------------+ +``` + +```ppl +source=json_test +| eval delete = json_delete('{"a": [{"b": 1}, {"b": 2}]}', 'a{2}.b') +| head 1 +| fields delete +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------+ +| delete | +|-------------------------| +| {"a":[{"b":1},{"b":2}]} | ++-------------------------+ +``` + +## JSON_SET + +### Description + +Usage: `json_set(json_string, path1, value1, path2, value2...)` sets values at the corresponding paths using the specified JSON paths and returns the JSON string after setting. If a path's parent node is not a JSON object, the path is skipped. +Argument type: json_string: STRING, path1: STRING, value1: ANY, path2: STRING, value2: ANY ... +Return type: STRING +Example + +```ppl +source=json_test +| eval jsonSet = json_set('{"a": [{"b": 1}]}', 'a{0}.b', 3) +| head 1 +| fields jsonSet +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------+ +| jsonSet | +|-----------------| +| {"a":[{"b":3}]} | ++-----------------+ +``` + +```ppl +source=json_test +| eval jsonSet = json_set('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 3, 'a{1}.b', 4) +| head 1 +| fields jsonSet +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------+ +| jsonSet | +|-------------------------| +| {"a":[{"b":3},{"b":4}]} | ++-------------------------+ +```
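+Assuming absent keys are created when the parent node is an existing JSON object (an illustrative sketch, not executed as a doctest), `json_set` can also add a new key: + +```ppl ignore +source=json_test +| eval jsonSet = json_set('{"a": 1}', 'b', 2) +| head 1 +| fields jsonSet +``` + +```text +fetched rows / total rows = 1/1 ++---------------+ +| jsonSet | +|---------------| +| {"a":1,"b":2} | ++---------------+ +```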
+## JSON_APPEND + +### Description + +Usage: `json_append(json_string, path1, value1, path2, value2...)` appends values at the corresponding paths using the specified JSON paths and returns the JSON string after appending. If a path's target node is not an array, the path is skipped. +Argument type: json_string: STRING, path1: STRING, value1: ANY, path2: STRING, value2: ANY ... +Return type: STRING +Example + +```ppl +source=json_test +| eval jsonAppend = json_set('{"a": [{"b": 1}]}', 'a', 3) +| head 1 +| fields jsonAppend +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------+ +| jsonAppend | +|------------| +| {"a":3} | ++------------+ +``` + +```ppl +source=json_test +| eval jsonAppend = json_append('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 3, 'a{1}.b', 4) +| head 1 +| fields jsonAppend +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------+ +| jsonAppend | +|-------------------------| +| {"a":[{"b":1},{"b":2}]} | ++-------------------------+ +``` + +```ppl +source=json_test +| eval jsonAppend = json_append('{"a": [{"b": 1}]}', 'a', '[1,2]', 'a{1}.b', 4) +| head 1 +| fields jsonAppend +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------+ +| jsonAppend | +|-------------------------| +| {"a":[{"b":1},"[1,2]"]} | ++-------------------------+ +``` + +## JSON_EXTEND + +### Description + +Usage: `json_extend(json_string, path1, value1, path2, value2...)` extends arrays at the corresponding paths using the specified JSON paths and returns the JSON string after extending. If a path's target node is not an array, the path is skipped. The function tries to parse each value as an array; if it can be parsed, its elements are appended to the target array, otherwise the value is treated as a single element. +Argument type: json_string: STRING, path1: STRING, value1: ANY, path2: STRING, value2: ANY ... +Return type: STRING +Example + +```ppl +source=json_test +| eval jsonExtend = json_extend('{"a": [{"b": 1}]}', 'a', 3) +| head 1 +| fields jsonExtend +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------+ +| jsonExtend | +|-------------------| +| {"a":[{"b":1},3]} | ++-------------------+ +``` + +```ppl +source=json_test +| eval jsonExtend = json_extend('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 3, 'a{1}.b', 4) +| head 1 +| fields jsonExtend +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------+ +| jsonExtend | +|-------------------------| +| {"a":[{"b":1},{"b":2}]} | ++-------------------------+ +``` + +```ppl +source=json_test +| eval jsonExtend = json_extend('{"a": [{"b": 1}]}', 'a', '[1,2]') +| head 1 +| fields jsonExtend +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------+ +| jsonExtend | +|-------------------------| +| {"a":[{"b":1},1.0,2.0]} | ++-------------------------+ +``` + +## JSON_KEYS + +### Description + +Usage: `json_keys(json_string)` returns the key list of the JSON object as a JSON array. For any other value, returns null.
+Argument type: json_string: A JSON STRING +Return type: STRING +Example + +```ppl +source=json_test +| eval jsonKeys = json_keys('{"a": 1, "b": 2}') +| head 1 +| fields jsonKeys +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+ +| jsonKeys | +|-----------| +| ["a","b"] | ++-----------+ +``` + +```ppl +source=json_test +| eval jsonKeys = json_keys('{"a": {"c": 1}, "b": 2}') +| head 1 +| fields jsonKeys +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+ +| jsonKeys | +|-----------| +| ["a","b"] | ++-----------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/json.rst b/docs/user/ppl/functions/json.rst deleted file mode 100644 index 61377847e04..00000000000 --- a/docs/user/ppl/functions/json.rst +++ /dev/null @@ -1,363 +0,0 @@ -==================== -JSON Functions -==================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - - - - -JSON Path ---------- - -Description ->>>>>>>>>>> - -All JSON paths used in JSON functions follow the format ``{}.{}...``. - -Each ```` represents a field name. The ``{}`` part is optional and is only applicable when the corresponding key refers to an array. - -For example:: - - a{2}.b{0} - -This refers to the element at index 0 of the ``b`` array, which is nested inside the element at index 2 of the ``a`` array. - -Notes: - -1. The ``{}`` notation applies **only when** the associated key points to an array. - -2. ``{}`` (without a specific index) is interpreted as a **wildcard**, equivalent to ``{*}``, meaning "all elements" in the array at that level. - -JSON ----------- - -Description ->>>>>>>>>>> - -Usage: `json(value)` Evaluates whether a string can be parsed as a json-encoded string. Returns the value if valid, null otherwise. - -Argument type: STRING - -Return type: STRING - -Example:: - - os> source=json_test | where json_valid(json_string) | eval json=json(json_string) | fields test_name, json_string, json - fetched rows / total rows = 4/4 - +--------------------+---------------------------------+---------------------------------+ - | test_name | json_string | json | - |--------------------+---------------------------------+---------------------------------| - | json nested object | {"a":"1","b":{"c":"2","d":"3"}} | {"a":"1","b":{"c":"2","d":"3"}} | - | json object | {"a":"1","b":"2"} | {"a":"1","b":"2"} | - | json array | [1, 2, 3, 4] | [1, 2, 3, 4] | - | json scalar string | "abc" | "abc" | - +--------------------+---------------------------------+---------------------------------+ - -JSON_VALID ----------- - -Description ->>>>>>>>>>> - -Version: 3.1.0 - -Limitation: Only works when plugins.calcite.enabled=true - -Usage: `json_valid(value)` Evaluates whether a string uses valid JSON syntax. Returns TRUE if valid, FALSE if invalid. NULL input returns NULL. - -Argument type: STRING - -Return type: BOOLEAN - -Example:: - - os> source=people | eval is_valid_json = json_valid('[1,2,3,4]'), is_invalid_json = json_valid('{invalid}') | fields is_valid_json, is_invalid_json | head 1 - fetched rows / total rows = 1/1 - +---------------+-----------------+ - | is_valid_json | is_invalid_json | - |---------------+-----------------| - | True | False | - +---------------+-----------------+ - -JSON_OBJECT ----------- - -Description ->>>>>>>>>>> - -Usage: `json_object(key1, value1, key2, value2...)` create a json object string with key value pairs. The key must be string. - -Argument type: key1: STRING, value1: ANY, key2: STRING, value2: ANY ... 
- -Return type: STRING - -Example:: - - os> source=json_test | eval test_json = json_object('key', 123.45) | head 1 | fields test_json - fetched rows / total rows = 1/1 - +----------------+ - | test_json | - |----------------| - | {"key":123.45} | - +----------------+ - -JSON_ARRAY ----------- - -Description ->>>>>>>>>>> - -Usage: `json_array(element1, element2, ...)` create a json array string with elements. - -Argument type: element1: ANY, element2: ANY ... - -Return type: STRING - -Example:: - - os> source=json_test | eval test_json_array = json_array('key', 123.45) | head 1 | fields test_json_array - fetched rows / total rows = 1/1 - +-----------------+ - | test_json_array | - |-----------------| - | ["key",123.45] | - +-----------------+ - -JSON_ARRAY_LENGTH ----------- - -Description ->>>>>>>>>>> - -Usage: `json_array_length(value)` parse the string to json array and return size,, null is returned in case of any other valid JSON string, null or an invalid JSON. - -Argument type: value: A JSON STRING - -Return type: INTEGER - -Example:: - - os> source=json_test | eval array_length = json_array_length("[1,2,3]") | head 1 | fields array_length - fetched rows / total rows = 1/1 - +--------------+ - | array_length | - |--------------| - | 3 | - +--------------+ - - os> source=json_test | eval array_length = json_array_length("{\"1\": 2}") | head 1 | fields array_length - fetched rows / total rows = 1/1 - +--------------+ - | array_length | - |--------------| - | null | - +--------------+ - -JSON_EXTRACT ----------- - -Description ->>>>>>>>>>> - -Usage: `json_extract(json_string, path1, path2, ...)` Extracts values using the specified JSON paths. If only one path is provided, it returns a single value. If multiple paths are provided, it returns a JSON Array in the order of the paths. If one path cannot find value, return null as the result for this path. The path use "{}" to represent index for array, "{}" means "{*}". - -Argument type: json_string: STRING, path1: STRING, path2: STRING ... - -Return type: STRING - -Example:: - - os> source=json_test | eval extract = json_extract('{"a": [{"b": 1}, {"b": 2}]}', 'a{}.b') | head 1 | fields extract - fetched rows / total rows = 1/1 - +---------+ - | extract | - |---------| - | [1,2] | - +---------+ - - os> source=json_test | eval extract = json_extract('{"a": [{"b": 1}, {"b": 2}]}', 'a{}.b', 'a{}') | head 1 | fields extract - fetched rows / total rows = 1/1 - +---------------------------+ - | extract | - |---------------------------| - | [[1,2],[{"b":1},{"b":2}]] | - +---------------------------+ - -JSON_DELETE ----------- - -Description ->>>>>>>>>>> - -Usage: `json_delete(json_string, path1, path2, ...)` Delete values using the specified JSON paths. Return the json string after deleting. If one path cannot find value, do nothing. - -Argument type: json_string: STRING, path1: STRING, path2: STRING ... 
- -Return type: STRING - -Example:: - - os> source=json_test | eval delete = json_delete('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b') | head 1 | fields delete - fetched rows / total rows = 1/1 - +--------------------+ - | delete | - |--------------------| - | {"a":[{},{"b":2}]} | - +--------------------+ - - os> source=json_test | eval delete = json_delete('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 'a{1}.b') | head 1 | fields delete - fetched rows / total rows = 1/1 - +---------------+ - | delete | - |---------------| - | {"a":[{},{}]} | - +---------------+ - - os> source=json_test | eval delete = json_delete('{"a": [{"b": 1}, {"b": 2}]}', 'a{2}.b') | head 1 | fields delete - fetched rows / total rows = 1/1 - +-------------------------+ - | delete | - |-------------------------| - | {"a":[{"b":1},{"b":2}]} | - +-------------------------+ - -JSON_SET ----------- - -Description ->>>>>>>>>>> - -Usage: `json_set(json_string, path1, value1, path2, value2...)` Set values to corresponding paths using the specified JSON paths. If one path's parent node is not a json object, skip the path. Return the json string after setting. - -Argument type: json_string: STRING, path1: STRING, value1: ANY, path2: STRING, value2: ANY ... - -Return type: STRING - -Example:: - - os> source=json_test | eval jsonSet = json_set('{"a": [{"b": 1}]}', 'a{0}.b', 3) | head 1 | fields jsonSet - fetched rows / total rows = 1/1 - +-----------------+ - | jsonSet | - |-----------------| - | {"a":[{"b":3}]} | - +-----------------+ - - os> source=json_test | eval jsonSet = json_set('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 3, 'a{1}.b', 4) | head 1 | fields jsonSet - fetched rows / total rows = 1/1 - +-------------------------+ - | jsonSet | - |-------------------------| - | {"a":[{"b":3},{"b":4}]} | - +-------------------------+ - -JSON_APPEND ----------- - -Description ->>>>>>>>>>> - -Usage: `json_append(json_string, path1, value1, path2, value2...)` Append values to corresponding paths using the specified JSON paths. If one path's target node is not an array, skip the path. Return the json string after setting. - -Argument type: json_string: STRING, path1: STRING, value1: ANY, path2: STRING, value2: ANY ... - -Return type: STRING - -Example:: - - os> source=json_test | eval jsonAppend = json_set('{"a": [{"b": 1}]}', 'a', 3) | head 1 | fields jsonAppend - fetched rows / total rows = 1/1 - +------------+ - | jsonAppend | - |------------| - | {"a":3} | - +------------+ - - os> source=json_test | eval jsonAppend = json_append('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 3, 'a{1}.b', 4) | head 1 | fields jsonAppend - fetched rows / total rows = 1/1 - +-------------------------+ - | jsonAppend | - |-------------------------| - | {"a":[{"b":1},{"b":2}]} | - +-------------------------+ - - os> source=json_test | eval jsonAppend = json_append('{"a": [{"b": 1}]}', 'a', '[1,2]', 'a{1}.b', 4) | head 1 | fields jsonAppend - fetched rows / total rows = 1/1 - +-------------------------+ - | jsonAppend | - |-------------------------| - | {"a":[{"b":1},"[1,2]"]} | - +-------------------------+ - -JSON_EXTEND ----------- - -Description ->>>>>>>>>>> - -Usage: `json_extend(json_string, path1, value1, path2, value2...)` Extend values to corresponding paths using the specified JSON paths. If one path's target node is not an array, skip the path. The function will try to parse the value as an array. If it can be parsed, extend it to the target array. Otherwise, regard the value a single one. Return the json string after setting. 
- -Argument type: json_string: STRING, path1: STRING, value1: ANY, path2: STRING, value2: ANY ... - -Return type: STRING - -Example:: - - os> source=json_test | eval jsonExtend = json_extend('{"a": [{"b": 1}]}', 'a', 3) | head 1 | fields jsonExtend - fetched rows / total rows = 1/1 - +-------------------+ - | jsonExtend | - |-------------------| - | {"a":[{"b":1},3]} | - +-------------------+ - - os> source=json_test | eval jsonExtend = json_extend('{"a": [{"b": 1}, {"b": 2}]}', 'a{0}.b', 3, 'a{1}.b', 4) | head 1 | fields jsonExtend - fetched rows / total rows = 1/1 - +-------------------------+ - | jsonExtend | - |-------------------------| - | {"a":[{"b":1},{"b":2}]} | - +-------------------------+ - - os> source=json_test | eval jsonExtend = json_extend('{"a": [{"b": 1}]}', 'a', '[1,2]') | head 1 | fields jsonExtend - fetched rows / total rows = 1/1 - +-------------------------+ - | jsonExtend | - |-------------------------| - | {"a":[{"b":1},1.0,2.0]} | - +-------------------------+ - -JSON_KEYS ---------- - -Description ->>>>>>>>>>> - -Usage: `json_keys(json_string)` Return the key list of the Json object as a Json array. Otherwise, return null. - -Argument type: json_string: A JSON STRING - -Return type: STRING - -Example:: - - os> source=json_test | eval jsonKeys = json_keys('{"a": 1, "b": 2}') | head 1 | fields jsonKeys - fetched rows / total rows = 1/1 - +-----------+ - | jsonKeys | - |-----------| - | ["a","b"] | - +-----------+ - - os> source=json_test | eval jsonKeys = json_keys('{"a": {"c": 1}, "b": 2}') | head 1 | fields jsonKeys - fetched rows / total rows = 1/1 - +-----------+ - | jsonKeys | - |-----------| - | ["a","b"] | - +-----------+ diff --git a/docs/user/ppl/functions/math.md b/docs/user/ppl/functions/math.md new file mode 100644 index 00000000000..6b2fe319df1 --- /dev/null +++ b/docs/user/ppl/functions/math.md @@ -0,0 +1,1187 @@ +# Mathematical Functions + +## ABS + +### Description + +Usage: abs(x) calculates the absolute value of x. +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: INTEGER/LONG/FLOAT/DOUBLE +Example + +```ppl +source=people +| eval `ABS(-1)` = ABS(-1) +| fields `ABS(-1)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| ABS(-1) | +|---------| +| 1 | ++---------+ +``` + +## ADD + +### Description + +Usage: add(x, y) calculates x plus y. +Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE +Return type: Wider number between x and y +Synonyms: Addition Symbol (+) +Example + +```ppl +source=people +| eval `ADD(2, 1)` = ADD(2, 1) +| fields `ADD(2, 1)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+ +| ADD(2, 1) | +|-----------| +| 3 | ++-----------+ +``` + +## SUBTRACT + +### Description + +Usage: subtract(x, y) calculates x minus y. +Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE +Return type: Wider number between x and y +Synonyms: Subtraction Symbol (-) +Example + +```ppl +source=people +| eval `SUBTRACT(2, 1)` = SUBTRACT(2, 1) +| fields `SUBTRACT(2, 1)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| SUBTRACT(2, 1) | +|----------------| +| 1 | ++----------------+ +```
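+As the synonym notes above indicate, the function forms and the operator symbols are interchangeable. A minimal sketch (illustrative only, not executed as a doctest): + +```ppl ignore +source=people +| eval `ADD(2, 1)` = ADD(2, 1), `2 + 1` = 2 + 1 +| fields `ADD(2, 1)`, `2 + 1` +``` + +```text +fetched rows / total rows = 1/1 ++-----------+-------+ +| ADD(2, 1) | 2 + 1 | +|-----------+-------| +| 3 | 3 | ++-----------+-------+ +```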
+## MULTIPLY + +### Description + +Usage: multiply(x, y) calculates the multiplication of x and y. +Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE +Return type: Wider number between x and y +Synonyms: Multiplication Symbol (\*) +Example + +```ppl +source=people +| eval `MULTIPLY(2, 1)` = MULTIPLY(2, 1) +| fields `MULTIPLY(2, 1)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| MULTIPLY(2, 1) | +|----------------| +| 2 | ++----------------+ +``` + +## DIVIDE + +### Description + +Usage: divide(x, y) calculates x divided by y. +Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE +Return type: Wider number between x and y. If y equals to 0, then returns NULL. +Synonyms: Division Symbol (/) +Example + +```ppl +source=people +| eval `DIVIDE(2, 1)` = DIVIDE(2, 1) +| fields `DIVIDE(2, 1)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+ +| DIVIDE(2, 1) | +|--------------| +| 2 | ++--------------+ +``` + +## SUM + +### Description + +Usage: sum(x, y, ...) calculates the sum of all provided arguments. This function accepts a variable number of arguments. +Note: This function is only available in the eval command context and is rewritten to arithmetic addition during query parsing. +Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE arguments +Return type: Wider number type among all arguments +Example + +```ppl +source=accounts +| eval `SUM(1, 2, 3)` = SUM(1, 2, 3) +| fields `SUM(1, 2, 3)` +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------------+ +| SUM(1, 2, 3) | +|--------------| +| 6 | +| 6 | +| 6 | +| 6 | ++--------------+ +``` + +```ppl +source=accounts +| eval total = SUM(age, 10, 5) +| fields age, total +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+-------+ +| age | total | +|-----+-------| +| 32 | 47 | +| 36 | 51 | +| 28 | 43 | +| 33 | 48 | ++-----+-------+ +``` + +## AVG + +### Description + +Usage: avg(x, y, ...) calculates the average (arithmetic mean) of all provided arguments. This function accepts a variable number of arguments. +Note: This function is only available in the eval command context and is rewritten to arithmetic expression (sum / count) at query parsing time. +Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE arguments +Return type: DOUBLE +Example + +```ppl +source=accounts +| eval `AVG(1, 2, 3)` = AVG(1, 2, 3) +| fields `AVG(1, 2, 3)` +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++--------------+ +| AVG(1, 2, 3) | +|--------------| +| 2.0 | +| 2.0 | +| 2.0 | +| 2.0 | ++--------------+ +``` + +```ppl +source=accounts +| eval average = AVG(age, 30) +| fields age, average +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+---------+ +| age | average | +|-----+---------| +| 32 | 31.0 | +| 36 | 33.0 | +| 28 | 29.0 | +| 33 | 31.5 | ++-----+---------+ +``` + +## ACOS + +### Description + +Usage: acos(x) calculates the arc cosine of x. Returns NULL if x is not in the range -1 to 1. +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `ACOS(0)` = ACOS(0) +| fields `ACOS(0)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+ +| ACOS(0) | +|--------------------| +| 1.5707963267948966 | ++--------------------+ +``` + +## ASIN + +### Description + +Usage: asin(x) calculates the arc sine of x. Returns NULL if x is not in the range -1 to 1.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `ASIN(0)` = ASIN(0) +| fields `ASIN(0)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+ +| ASIN(0) | +|---------| +| 0.0 | ++---------+ +``` + +## ATAN + +### Description + +Usage: atan(x) calculates the arc tangent of x. atan(y, x) calculates the arc tangent of y / x, except that the signs of both arguments are used to determine the quadrant of the result. +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `ATAN(2)` = ATAN(2), `ATAN(2, 3)` = ATAN(2, 3) +| fields `ATAN(2)`, `ATAN(2, 3)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+--------------------+ +| ATAN(2) | ATAN(2, 3) | +|--------------------+--------------------| +| 1.1071487177940904 | 0.5880026035475675 | ++--------------------+--------------------+ +``` + +## ATAN2 + +### Description + +Usage: atan2(y, x) calculates the arc tangent of y / x, except that the signs of both arguments are used to determine the quadrant of the result. +Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `ATAN2(2, 3)` = ATAN2(2, 3) +| fields `ATAN2(2, 3)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+ +| ATAN2(2, 3) | +|--------------------| +| 0.5880026035475675 | ++--------------------+ +``` + +## CEIL + +An alias for [CEILING](#ceiling) function. +## CEILING + +### Description + +Usage: CEILING(T) takes the ceiling of value T. +Note: [CEIL](#ceil) and CEILING functions have the same implementation & functionality +Limitation: CEILING only works as expected when IEEE 754 double type displays decimal when stored. +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: same type with input +Example + +```ppl +source=people +| eval `CEILING(0)` = CEILING(0), `CEILING(50.00005)` = CEILING(50.00005), `CEILING(-50.00005)` = CEILING(-50.00005) +| fields `CEILING(0)`, `CEILING(50.00005)`, `CEILING(-50.00005)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------+-------------------+--------------------+ +| CEILING(0) | CEILING(50.00005) | CEILING(-50.00005) | +|------------+-------------------+--------------------| +| 0 | 51.0 | -50.0 | ++------------+-------------------+--------------------+ +``` + +```ppl +source=people +| eval `CEILING(3147483647.12345)` = CEILING(3147483647.12345), `CEILING(113147483647.12345)` = CEILING(113147483647.12345), `CEILING(3147483647.00001)` = CEILING(3147483647.00001) +| fields `CEILING(3147483647.12345)`, `CEILING(113147483647.12345)`, `CEILING(3147483647.00001)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------------+-----------------------------+---------------------------+ +| CEILING(3147483647.12345) | CEILING(113147483647.12345) | CEILING(3147483647.00001) | +|---------------------------+-----------------------------+---------------------------| +| 3147483648.0 | 113147483648.0 | 3147483648.0 | ++---------------------------+-----------------------------+---------------------------+ +``` + +## CONV + +### Description + +Usage: CONV(x, a, b) converts the number x from a base to b base. 
+Argument type: x: STRING, a: INTEGER, b: INTEGER +Return type: STRING +Example + +```ppl +source=people +| eval `CONV('12', 10, 16)` = CONV('12', 10, 16), `CONV('2C', 16, 10)` = CONV('2C', 16, 10), `CONV(12, 10, 2)` = CONV(12, 10, 2), `CONV(1111, 2, 10)` = CONV(1111, 2, 10) +| fields `CONV('12', 10, 16)`, `CONV('2C', 16, 10)`, `CONV(12, 10, 2)`, `CONV(1111, 2, 10)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+--------------------+-----------------+-------------------+ +| CONV('12', 10, 16) | CONV('2C', 16, 10) | CONV(12, 10, 2) | CONV(1111, 2, 10) | +|--------------------+--------------------+-----------------+-------------------| +| c | 44 | 1100 | 15 | ++--------------------+--------------------+-----------------+-------------------+ +``` + +## COS + +### Description + +Usage: cos(x) calculates the cosine of x, where x is given in radians. +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `COS(0)` = COS(0) +| fields `COS(0)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------+ +| COS(0) | +|--------| +| 1.0 | ++--------+ +``` + +## COSH + +### Description + +Usage: cosh(x) calculates the hyperbolic cosine of x, defined as (((e^x) + (e^(-x))) / 2). +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `COSH(2)` = COSH(2) +| fields `COSH(2)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+ +| COSH(2) | +|--------------------| +| 3.7621956910836314 | ++--------------------+ +``` + +## COT + +### Description + +Usage: cot(x) calculates the cotangent of x. Returns out-of-range error if x equals to 0. +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `COT(1)` = COT(1) +| fields `COT(1)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------+ +| COT(1) | +|--------------------| +| 0.6420926159343306 | ++--------------------+ +``` + +## CRC32 + +### Description + +Usage: Calculates a cyclic redundancy check value and returns a 32-bit unsigned value. +Argument type: STRING +Return type: LONG +Example + +```ppl +source=people +| eval `CRC32('MySQL')` = CRC32('MySQL') +| fields `CRC32('MySQL')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| CRC32('MySQL') | +|----------------| +| 3259397556 | ++----------------+ +``` + +## DEGREES + +### Description + +Usage: degrees(x) converts x from radians to degrees. +Argument type: INTEGER/LONG/FLOAT/DOUBLE +Return type: DOUBLE +Example + +```ppl +source=people +| eval `DEGREES(1.57)` = DEGREES(1.57) +| fields `DEGREES(1.57)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------+ +| DEGREES(1.57) | +|-------------------| +| 89.95437383553924 | ++-------------------+ +``` + +## E + +### Description + +Usage: E() returns the Euler's number +Return type: DOUBLE +Example + +```ppl +source=people +| eval `E()` = E() +| fields `E()` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------+ +| E() | +|-------------------| +| 2.718281828459045 | ++-------------------+ +``` + +## EXP + +### Description + +Usage: exp(x) return e raised to the power of x. 
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `EXP(2)` = EXP(2)
+| fields `EXP(2)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------------+
+| EXP(2)           |
+|------------------|
+| 7.38905609893065 |
++------------------+
+```
+
+## EXPM1
+
+### Description
+
+Usage: expm1(NUMBER T) returns the exponential of T, minus 1.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `EXPM1(1)` = EXPM1(1)
+| fields `EXPM1(1)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------+
+| EXPM1(1)          |
+|-------------------|
+| 1.718281828459045 |
++-------------------+
+```
+
+## FLOOR
+
+### Description
+
+Usage: FLOOR(T) takes the floor of value T.
+Limitation: FLOOR only works as expected when the IEEE 754 double type displays decimals when stored.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: same type as the input
+Example
+
+```ppl
+source=people
+| eval `FLOOR(0)` = FLOOR(0), `FLOOR(50.00005)` = FLOOR(50.00005), `FLOOR(-50.00005)` = FLOOR(-50.00005)
+| fields `FLOOR(0)`, `FLOOR(50.00005)`, `FLOOR(-50.00005)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------+-----------------+------------------+
+| FLOOR(0) | FLOOR(50.00005) | FLOOR(-50.00005) |
+|----------+-----------------+------------------|
+| 0        | 50.0            | -51.0            |
++----------+-----------------+------------------+
+```
+
+```ppl
+source=people
+| eval `FLOOR(3147483647.12345)` = FLOOR(3147483647.12345), `FLOOR(113147483647.12345)` = FLOOR(113147483647.12345), `FLOOR(3147483647.00001)` = FLOOR(3147483647.00001)
+| fields `FLOOR(3147483647.12345)`, `FLOOR(113147483647.12345)`, `FLOOR(3147483647.00001)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------------+---------------------------+-------------------------+
+| FLOOR(3147483647.12345) | FLOOR(113147483647.12345) | FLOOR(3147483647.00001) |
+|-------------------------+---------------------------+-------------------------|
+| 3147483647.0            | 113147483647.0            | 3147483647.0            |
++-------------------------+---------------------------+-------------------------+
+```
+
+```ppl
+source=people
+| eval `FLOOR(282474973688888.022)` = FLOOR(282474973688888.022), `FLOOR(9223372036854775807.022)` = FLOOR(9223372036854775807.022), `FLOOR(9223372036854775807.0000001)` = FLOOR(9223372036854775807.0000001)
+| fields `FLOOR(282474973688888.022)`, `FLOOR(9223372036854775807.022)`, `FLOOR(9223372036854775807.0000001)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------------------------+--------------------------------+------------------------------------+
+| FLOOR(282474973688888.022) | FLOOR(9223372036854775807.022) | FLOOR(9223372036854775807.0000001) |
+|----------------------------+--------------------------------+------------------------------------|
+| 282474973688888.0          | 9.223372036854776e+18          | 9.223372036854776e+18              |
++----------------------------+--------------------------------+------------------------------------+
+```
+
+## LN
+
+### Description
+
+Usage: ln(x) returns the natural logarithm of x.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `LN(2)` = LN(2)
+| fields `LN(2)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------+
+| LN(2)              |
+|--------------------|
+| 0.6931471805599453 |
++--------------------+
+```
+
+## LOG
+
+### Description
+
+Usage: log(x) returns the natural logarithm of x, that is, the base-e logarithm of x. log(B, x) is equivalent to log(x)/log(B).
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `LOG(2)` = LOG(2), `LOG(2, 8)` = LOG(2, 8)
+| fields `LOG(2)`, `LOG(2, 8)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------+-----------+
+| LOG(2)             | LOG(2, 8) |
+|--------------------+-----------|
+| 0.6931471805599453 | 3.0       |
++--------------------+-----------+
+```
+
+## LOG2
+
+### Description
+
+Usage: log2(x) is equivalent to log(x)/log(2).
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `LOG2(8)` = LOG2(8)
+| fields `LOG2(8)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+
+| LOG2(8) |
+|---------|
+| 3.0     |
++---------+
+```
+
+## LOG10
+
+### Description
+
+Usage: log10(x) is equivalent to log(x)/log(10).
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `LOG10(100)` = LOG10(100)
+| fields `LOG10(100)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++------------+
+| LOG10(100) |
+|------------|
+| 2.0        |
++------------+
+```
+
+## MOD
+
+### Description
+
+Usage: MOD(n, m) calculates the remainder of the number n divided by m.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE
+Return type: Wider type between the types of n and m if m is a nonzero value. If m equals 0, returns NULL.
+Example
+
+```ppl
+source=people
+| eval `MOD(3, 2)` = MOD(3, 2), `MOD(3.1, 2)` = MOD(3.1, 2)
+| fields `MOD(3, 2)`, `MOD(3.1, 2)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+-------------+
+| MOD(3, 2) | MOD(3.1, 2) |
+|-----------+-------------|
+| 1         | 1.1         |
++-----------+-------------+
+```
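+
+As a sketch of the zero-divisor case described above (illustrative only, hence the `ignore` marker), MOD returns NULL when m is 0; the NULL result renders as `null` in the output:
+
+```ppl ignore
+source=people
+| eval `MOD(3, 0)` = MOD(3, 0)
+| fields `MOD(3, 0)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+
+| MOD(3, 0) |
+|-----------|
+| null      |
++-----------+
+```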
+
+## MODULUS
+
+### Description
+
+Usage: MODULUS(n, m) calculates the remainder of the number n divided by m.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE
+Return type: Wider type between the types of n and m if m is a nonzero value. If m equals 0, returns NULL.
+Example
+
+```ppl
+source=people
+| eval `MODULUS(3, 2)` = MODULUS(3, 2), `MODULUS(3.1, 2)` = MODULUS(3.1, 2)
+| fields `MODULUS(3, 2)`, `MODULUS(3.1, 2)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------+-----------------+
+| MODULUS(3, 2) | MODULUS(3.1, 2) |
+|---------------+-----------------|
+| 1             | 1.1             |
++---------------+-----------------+
+```
+
+## PI
+
+### Description
+
+Usage: PI() returns the constant pi.
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `PI()` = PI()
+| fields `PI()`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------+
+| PI()              |
+|-------------------|
+| 3.141592653589793 |
++-------------------+
+```
+
+## POW
+
+### Description
+
+Usage: POW(x, y) calculates the value of x raised to the power of y. Invalid inputs return a NULL result.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Synonyms: [POWER](#power)
+Example
+
+```ppl
+source=people
+| eval `POW(3, 2)` = POW(3, 2), `POW(-3, 2)` = POW(-3, 2), `POW(3, -2)` = POW(3, -2)
+| fields `POW(3, 2)`, `POW(-3, 2)`, `POW(3, -2)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+------------+--------------------+
+| POW(3, 2) | POW(-3, 2) | POW(3, -2)         |
+|-----------+------------+--------------------|
+| 9.0       | 9.0        | 0.1111111111111111 |
++-----------+------------+--------------------+
+```
+
+## POWER
+
+### Description
+
+Usage: POWER(x, y) calculates the value of x raised to the power of y. Invalid inputs return a NULL result.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Synonyms: [POW](#pow)
+Example
+
+```ppl
+source=people
+| eval `POWER(3, 2)` = POWER(3, 2), `POWER(-3, 2)` = POWER(-3, 2), `POWER(3, -2)` = POWER(3, -2)
+| fields `POWER(3, 2)`, `POWER(-3, 2)`, `POWER(3, -2)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------+--------------+--------------------+
+| POWER(3, 2) | POWER(-3, 2) | POWER(3, -2)       |
+|-------------+--------------+--------------------|
+| 9.0         | 9.0          | 0.1111111111111111 |
++-------------+--------------+--------------------+
+```
+
+## RADIANS
+
+### Description
+
+Usage: radians(x) converts x from degrees to radians.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `RADIANS(90)` = RADIANS(90)
+| fields `RADIANS(90)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------------+
+| RADIANS(90)        |
+|--------------------|
+| 1.5707963267948966 |
++--------------------+
+```
+
+## RAND
+
+### Description
+
+Usage: RAND()/RAND(N) returns a random floating-point value in the range 0 <= value < 1.0. If an integer N is specified, the seed is initialized prior to execution. One implication of this behavior is that with an identical argument N, rand(N) returns the same value each time, thus producing a repeatable sequence of column values.
+Argument type: INTEGER
+Return type: FLOAT
+Example
+
+```ppl
+source=people
+| eval `RAND(3)` = RAND(3)
+| fields `RAND(3)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------------------+
+| RAND(3)             |
+|---------------------|
+| 0.34346429521113886 |
++---------------------+
+```
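+
+To illustrate the repeatable-sequence behavior described above, here is a hypothetical sketch (illustrative only, hence the `ignore` marker); under the documented seeding semantics, both columns should hold the same value:
+
+```ppl ignore
+source=people
+| eval r1 = RAND(3), r2 = RAND(3)
+| fields r1, r2
+```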
+
+## ROUND
+
+### Description
+
+Usage: ROUND(x, d) rounds the argument x to d decimal places; d defaults to 0 if not specified.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type map:
+(INTEGER/LONG [,INTEGER]) -> LONG
+(FLOAT/DOUBLE [,INTEGER]) -> DOUBLE
+Example
+
+```ppl
+source=people
+| eval `ROUND(12.34)` = ROUND(12.34), `ROUND(12.34, 1)` = ROUND(12.34, 1), `ROUND(12.34, -1)` = ROUND(12.34, -1), `ROUND(12, 1)` = ROUND(12, 1)
+| fields `ROUND(12.34)`, `ROUND(12.34, 1)`, `ROUND(12.34, -1)`, `ROUND(12, 1)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------------+-----------------+------------------+--------------+
+| ROUND(12.34) | ROUND(12.34, 1) | ROUND(12.34, -1) | ROUND(12, 1) |
+|--------------+-----------------+------------------+--------------|
+| 12.0         | 12.3            | 10.0             | 12           |
++--------------+-----------------+------------------+--------------+
+```
+
+## SIGN
+
+### Description
+
+Usage: Returns the sign of the argument as -1, 0, or 1, depending on whether the number is negative, zero, or positive.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: same type as the input
+Example
+
+```ppl
+source=people
+| eval `SIGN(1)` = SIGN(1), `SIGN(0)` = SIGN(0), `SIGN(-1.1)` = SIGN(-1.1)
+| fields `SIGN(1)`, `SIGN(0)`, `SIGN(-1.1)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+---------+------------+
+| SIGN(1) | SIGN(0) | SIGN(-1.1) |
+|---------+---------+------------|
+| 1       | 0       | -1.0       |
++---------+---------+------------+
+```
+
+## SIGNUM
+
+### Description
+
+Usage: Returns the sign of the argument as -1, 0, or 1, depending on whether the number is negative, zero, or positive.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: same type as the input
+Synonyms: `SIGN`
+Example
+
+```ppl
+source=people
+| eval `SIGNUM(1)` = SIGNUM(1), `SIGNUM(0)` = SIGNUM(0), `SIGNUM(-1.1)` = SIGNUM(-1.1)
+| fields `SIGNUM(1)`, `SIGNUM(0)`, `SIGNUM(-1.1)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+-----------+--------------+
+| SIGNUM(1) | SIGNUM(0) | SIGNUM(-1.1) |
+|-----------+-----------+--------------|
+| 1         | 0         | -1.0         |
++-----------+-----------+--------------+
+```
+
+## SIN
+
+### Description
+
+Usage: sin(x) calculates the sine of x, where x is given in radians.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `SIN(0)` = SIN(0)
+| fields `SIN(0)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++--------+
+| SIN(0) |
+|--------|
+| 0.0    |
++--------+
+```
+
+## SINH
+
+### Description
+
+Usage: sinh(x) calculates the hyperbolic sine of x, defined as (((e^x) - (e^(-x))) / 2).
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `SINH(2)` = SINH(2)
+| fields `SINH(2)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-------------------+
+| SINH(2)           |
+|-------------------|
+| 3.626860407847019 |
++-------------------+
+```
+
+## SQRT
+
+### Description
+
+Usage: Calculates the square root of a non-negative number.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type map:
+(Non-negative) INTEGER/LONG/FLOAT/DOUBLE -> DOUBLE
+(Negative) INTEGER/LONG/FLOAT/DOUBLE -> NULL
+Example
+
+```ppl
+source=people
+| eval `SQRT(4)` = SQRT(4), `SQRT(4.41)` = SQRT(4.41)
+| fields `SQRT(4)`, `SQRT(4.41)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+------------+
+| SQRT(4) | SQRT(4.41) |
+|---------+------------|
+| 2.0     | 2.1        |
++---------+------------+
+```
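+
+As a sketch of the negative-input row in the return type map above (illustrative only, hence the `ignore` marker), the square root of a negative number yields NULL:
+
+```ppl ignore
+source=people
+| eval `SQRT(-4)` = SQRT(-4)
+| fields `SQRT(-4)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------+
+| SQRT(-4) |
+|----------|
+| null     |
++----------+
+```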
+
+## CBRT
+
+### Description
+
+Usage: Calculates the cube root of a number.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl ignore
+source=location
+| eval `CBRT(8)` = CBRT(8), `CBRT(9.261)` = CBRT(9.261), `CBRT(-27)` = CBRT(-27)
+| fields `CBRT(8)`, `CBRT(9.261)`, `CBRT(-27)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++---------+-------------+-----------+
+| CBRT(8) | CBRT(9.261) | CBRT(-27) |
+|---------+-------------+-----------|
+| 2.0     | 2.1         | -3.0      |
+| 2.0     | 2.1         | -3.0      |
++---------+-------------+-----------+
+```
+
+## RINT
+
+### Description
+
+Usage: rint(NUMBER T) returns T rounded to the closest whole integer number.
+Argument type: INTEGER/LONG/FLOAT/DOUBLE
+Return type: DOUBLE
+Example
+
+```ppl
+source=people
+| eval `RINT(1.7)` = RINT(1.7)
+| fields `RINT(1.7)`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+
+| RINT(1.7) |
+|-----------|
+| 2.0       |
++-----------+
+```
+
\ No newline at end of file
diff --git a/docs/user/ppl/functions/math.rst b/docs/user/ppl/functions/math.rst
deleted file mode 100644
index 20029c2c6ec..00000000000
--- a/docs/user/ppl/functions/math.rst
+++ /dev/null
@@ -1,1045 +0,0 @@
-======================
-Mathematical Functions
-======================
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-
-ABS
----
-
-Description
->>>>>>>>>>>
-
-Usage: abs(x) calculates the abs x.
-
-Argument type: INTEGER/LONG/FLOAT/DOUBLE
-
-Return type: INTEGER/LONG/FLOAT/DOUBLE
-
-Example::
-
-    os> source=people | eval `ABS(-1)` = ABS(-1) | fields `ABS(-1)`
-    fetched rows / total rows = 1/1
-    +---------+
-    | ABS(-1) |
-    |---------|
-    | 1       |
-    +---------+
-
-
-ADD
----
-
-Description
->>>>>>>>>>>
-
-Usage: add(x, y) calculates x plus y.
-
-Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE
-
-Return type: Wider number between x and y
-
-Synonyms: Addition Symbol (+)
-
-Example::
-
-    os> source=people | eval `ADD(2, 1)` = ADD(2, 1) | fields `ADD(2, 1)`
-    fetched rows / total rows = 1/1
-    +-----------+
-    | ADD(2, 1) |
-    |-----------|
-    | 3         |
-    +-----------+
-
-
-SUBTRACT
---------
-
-Description
->>>>>>>>>>>
-
-Usage: subtract(x, y) calculates x minus y.
- -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: Wider number between x and y - -Synonyms: Subtraction Symbol (-) - -Example:: - - os> source=people | eval `SUBTRACT(2, 1)` = SUBTRACT(2, 1) | fields `SUBTRACT(2, 1)` - fetched rows / total rows = 1/1 - +----------------+ - | SUBTRACT(2, 1) | - |----------------| - | 1 | - +----------------+ - - -MULTIPLY --------- - -Description ->>>>>>>>>>> - -Usage: multiply(x, y) calculates the multiplication of x and y. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: Wider number between x and y. If y equals to 0, then returns NULL. - -Synonyms: Multiplication Symbol (\*) - -Example:: - - os> source=people | eval `MULTIPLY(2, 1)` = MULTIPLY(2, 1) | fields `MULTIPLY(2, 1)` - fetched rows / total rows = 1/1 - +----------------+ - | MULTIPLY(2, 1) | - |----------------| - | 2 | - +----------------+ - - -DIVIDE ------- - -Description ->>>>>>>>>>> - -Usage: divide(x, y) calculates x divided by y. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: Wider number between x and y - -Synonyms: Division Symbol (/) - -Example:: - - os> source=people | eval `DIVIDE(2, 1)` = DIVIDE(2, 1) | fields `DIVIDE(2, 1)` - fetched rows / total rows = 1/1 - +--------------+ - | DIVIDE(2, 1) | - |--------------| - | 2 | - +--------------+ - - -SUM ---- - -Description ->>>>>>>>>>> - -Usage: sum(x, y, ...) calculates the sum of all provided arguments. This function accepts a variable number of arguments. - -Note: This function is only available in the eval command context and is rewritten to arithmetic addition while query parsing. - -Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE arguments - -Return type: Wider number type among all arguments - -Example:: - - os> source=accounts | eval `SUM(1, 2, 3)` = SUM(1, 2, 3) | fields `SUM(1, 2, 3)` - fetched rows / total rows = 4/4 - +--------------+ - | SUM(1, 2, 3) | - |--------------| - | 6 | - | 6 | - | 6 | - | 6 | - +--------------+ - - os> source=accounts | eval total = SUM(age, 10, 5) | fields age, total - fetched rows / total rows = 4/4 - +-----+-------+ - | age | total | - |-----+-------| - | 32 | 47 | - | 36 | 51 | - | 28 | 43 | - | 33 | 48 | - +-----+-------+ - - -AVG ---- - -Description ->>>>>>>>>>> - -Usage: avg(x, y, ...) calculates the average (arithmetic mean) of all provided arguments. This function accepts a variable number of arguments. - -Note: This function is only available in the eval command context and is rewritten to arithmetic expression (sum / count) at query parsing time. - -Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE arguments - -Return type: DOUBLE - -Example:: - - os> source=accounts | eval `AVG(1, 2, 3)` = AVG(1, 2, 3) | fields `AVG(1, 2, 3)` - fetched rows / total rows = 4/4 - +--------------+ - | AVG(1, 2, 3) | - |--------------| - | 2.0 | - | 2.0 | - | 2.0 | - | 2.0 | - +--------------+ - - os> source=accounts | eval average = AVG(age, 30) | fields age, average - fetched rows / total rows = 4/4 - +-----+---------+ - | age | average | - |-----+---------| - | 32 | 31.0 | - | 36 | 33.0 | - | 28 | 29.0 | - | 33 | 31.5 | - +-----+---------+ - - -ACOS ----- - -Description ->>>>>>>>>>> - -Usage: acos(x) calculates the arc cosine of x. Returns NULL if x is not in the range -1 to 1. 
- -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `ACOS(0)` = ACOS(0) | fields `ACOS(0)` - fetched rows / total rows = 1/1 - +--------------------+ - | ACOS(0) | - |--------------------| - | 1.5707963267948966 | - +--------------------+ - - -ASIN ----- - -Description ->>>>>>>>>>> - -Usage: asin(x) calculate the arc sine of x. Returns NULL if x is not in the range -1 to 1. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `ASIN(0)` = ASIN(0) | fields `ASIN(0)` - fetched rows / total rows = 1/1 - +---------+ - | ASIN(0) | - |---------| - | 0.0 | - +---------+ - - -ATAN ----- - -Description ->>>>>>>>>>> - -Usage: atan(x) calculates the arc tangent of x. atan(y, x) calculates the arc tangent of y / x, except that the signs of both arguments are used to determine the quadrant of the result. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `ATAN(2)` = ATAN(2), `ATAN(2, 3)` = ATAN(2, 3) | fields `ATAN(2)`, `ATAN(2, 3)` - fetched rows / total rows = 1/1 - +--------------------+--------------------+ - | ATAN(2) | ATAN(2, 3) | - |--------------------+--------------------| - | 1.1071487177940904 | 0.5880026035475675 | - +--------------------+--------------------+ - - -ATAN2 ------ - -Description ->>>>>>>>>>> - -Usage: atan2(y, x) calculates the arc tangent of y / x, except that the signs of both arguments are used to determine the quadrant of the result. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `ATAN2(2, 3)` = ATAN2(2, 3) | fields `ATAN2(2, 3)` - fetched rows / total rows = 1/1 - +--------------------+ - | ATAN2(2, 3) | - |--------------------| - | 0.5880026035475675 | - +--------------------+ - - -CEIL ----- - -An alias for `CEILING`_ function. - - -CEILING -------- - -Description ->>>>>>>>>>> - -Usage: CEILING(T) takes the ceiling of value T. - -Note: `CEIL`_ and CEILING functions have the same implementation & functionality - -Limitation: CEILING only works as expected when IEEE 754 double type displays decimal when stored. 
- -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: same type with input - -Example:: - - os> source=people | eval `CEILING(0)` = CEILING(0), `CEILING(50.00005)` = CEILING(50.00005), `CEILING(-50.00005)` = CEILING(-50.00005) | fields `CEILING(0)`, `CEILING(50.00005)`, `CEILING(-50.00005)` - fetched rows / total rows = 1/1 - +------------+-------------------+--------------------+ - | CEILING(0) | CEILING(50.00005) | CEILING(-50.00005) | - |------------+-------------------+--------------------| - | 0 | 51.0 | -50.0 | - +------------+-------------------+--------------------+ - - os> source=people | eval `CEILING(3147483647.12345)` = CEILING(3147483647.12345), `CEILING(113147483647.12345)` = CEILING(113147483647.12345), `CEILING(3147483647.00001)` = CEILING(3147483647.00001) | fields `CEILING(3147483647.12345)`, `CEILING(113147483647.12345)`, `CEILING(3147483647.00001)` - fetched rows / total rows = 1/1 - +---------------------------+-----------------------------+---------------------------+ - | CEILING(3147483647.12345) | CEILING(113147483647.12345) | CEILING(3147483647.00001) | - |---------------------------+-----------------------------+---------------------------| - | 3147483648.0 | 113147483648.0 | 3147483648.0 | - +---------------------------+-----------------------------+---------------------------+ - - -CONV ----- - -Description ->>>>>>>>>>> - -Usage: CONV(x, a, b) converts the number x from a base to b base. - -Argument type: x: STRING, a: INTEGER, b: INTEGER - -Return type: STRING - -Example:: - - os> source=people | eval `CONV('12', 10, 16)` = CONV('12', 10, 16), `CONV('2C', 16, 10)` = CONV('2C', 16, 10), `CONV(12, 10, 2)` = CONV(12, 10, 2), `CONV(1111, 2, 10)` = CONV(1111, 2, 10) | fields `CONV('12', 10, 16)`, `CONV('2C', 16, 10)`, `CONV(12, 10, 2)`, `CONV(1111, 2, 10)` - fetched rows / total rows = 1/1 - +--------------------+--------------------+-----------------+-------------------+ - | CONV('12', 10, 16) | CONV('2C', 16, 10) | CONV(12, 10, 2) | CONV(1111, 2, 10) | - |--------------------+--------------------+-----------------+-------------------| - | c | 44 | 1100 | 15 | - +--------------------+--------------------+-----------------+-------------------+ - - -COS ---- - -Description ->>>>>>>>>>> - -Usage: cos(x) calculates the cosine of x, where x is given in radians. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `COS(0)` = COS(0) | fields `COS(0)` - fetched rows / total rows = 1/1 - +--------+ - | COS(0) | - |--------| - | 1.0 | - +--------+ - - -COSH ----- - -Description ->>>>>>>>>>> - -Usage: cosh(x) calculates the hyperbolic cosine of x, defined as (((e^x) + (e^(-x))) / 2). - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `COSH(2)` = COSH(2) | fields `COSH(2)` - fetched rows / total rows = 1/1 - +--------------------+ - | COSH(2) | - |--------------------| - | 3.7621956910836314 | - +--------------------+ - - -COT ---- - -Description ->>>>>>>>>>> - -Usage: cot(x) calculates the cotangent of x. Returns out-of-range error if x equals to 0. 
- -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `COT(1)` = COT(1) | fields `COT(1)` - fetched rows / total rows = 1/1 - +--------------------+ - | COT(1) | - |--------------------| - | 0.6420926159343306 | - +--------------------+ - - -CRC32 ------ - -Description ->>>>>>>>>>> - -Usage: Calculates a cyclic redundancy check value and returns a 32-bit unsigned value. - -Argument type: STRING - -Return type: LONG - -Example:: - - os> source=people | eval `CRC32('MySQL')` = CRC32('MySQL') | fields `CRC32('MySQL')` - fetched rows / total rows = 1/1 - +----------------+ - | CRC32('MySQL') | - |----------------| - | 3259397556 | - +----------------+ - - -DEGREES -------- - -Description ->>>>>>>>>>> - -Usage: degrees(x) converts x from radians to degrees. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `DEGREES(1.57)` = DEGREES(1.57) | fields `DEGREES(1.57)` - fetched rows / total rows = 1/1 - +-------------------+ - | DEGREES(1.57) | - |-------------------| - | 89.95437383553924 | - +-------------------+ - - -E -- - -Description ->>>>>>>>>>> - -Usage: E() returns the Euler's number - -Return type: DOUBLE - -Example:: - - os> source=people | eval `E()` = E() | fields `E()` - fetched rows / total rows = 1/1 - +-------------------+ - | E() | - |-------------------| - | 2.718281828459045 | - +-------------------+ - - -EXP ---- - -Description ->>>>>>>>>>> - -Usage: exp(x) return e raised to the power of x. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `EXP(2)` = EXP(2) | fields `EXP(2)` - fetched rows / total rows = 1/1 - +------------------+ - | EXP(2) | - |------------------| - | 7.38905609893065 | - +------------------+ - - -EXPM1 ------ - -Description ->>>>>>>>>>> - -Usage: expm1(NUMBER T) returns the exponential of T, minus 1. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `EXPM1(1)` = EXPM1(1) | fields `EXPM1(1)` - fetched rows / total rows = 1/1 - +-------------------+ - | EXPM1(1) | - |-------------------| - | 1.718281828459045 | - +-------------------+ - - -FLOOR ------ - -Description ->>>>>>>>>>> - -Usage: FLOOR(T) takes the floor of value T. - -Limitation: FLOOR only works as expected when IEEE 754 double type displays decimal when stored. 
- -Argument type: a: INTEGER/LONG/FLOAT/DOUBLE - -Return type: same type with input - -Example:: - - os> source=people | eval `FLOOR(0)` = FLOOR(0), `FLOOR(50.00005)` = FLOOR(50.00005), `FLOOR(-50.00005)` = FLOOR(-50.00005) | fields `FLOOR(0)`, `FLOOR(50.00005)`, `FLOOR(-50.00005)` - fetched rows / total rows = 1/1 - +----------+-----------------+------------------+ - | FLOOR(0) | FLOOR(50.00005) | FLOOR(-50.00005) | - |----------+-----------------+------------------| - | 0 | 50.0 | -51.0 | - +----------+-----------------+------------------+ - - os> source=people | eval `FLOOR(3147483647.12345)` = FLOOR(3147483647.12345), `FLOOR(113147483647.12345)` = FLOOR(113147483647.12345), `FLOOR(3147483647.00001)` = FLOOR(3147483647.00001) | fields `FLOOR(3147483647.12345)`, `FLOOR(113147483647.12345)`, `FLOOR(3147483647.00001)` - fetched rows / total rows = 1/1 - +-------------------------+---------------------------+-------------------------+ - | FLOOR(3147483647.12345) | FLOOR(113147483647.12345) | FLOOR(3147483647.00001) | - |-------------------------+---------------------------+-------------------------| - | 3147483647.0 | 113147483647.0 | 3147483647.0 | - +-------------------------+---------------------------+-------------------------+ - - os> source=people | eval `FLOOR(282474973688888.022)` = FLOOR(282474973688888.022), `FLOOR(9223372036854775807.022)` = FLOOR(9223372036854775807.022), `FLOOR(9223372036854775807.0000001)` = FLOOR(9223372036854775807.0000001) | fields `FLOOR(282474973688888.022)`, `FLOOR(9223372036854775807.022)`, `FLOOR(9223372036854775807.0000001)` - fetched rows / total rows = 1/1 - +----------------------------+--------------------------------+------------------------------------+ - | FLOOR(282474973688888.022) | FLOOR(9223372036854775807.022) | FLOOR(9223372036854775807.0000001) | - |----------------------------+--------------------------------+------------------------------------| - | 282474973688888.0 | 9.223372036854776e+18 | 9.223372036854776e+18 | - +----------------------------+--------------------------------+------------------------------------+ - - -LN --- - -Description ->>>>>>>>>>> - -Usage: ln(x) return the the natural logarithm of x. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `LN(2)` = LN(2) | fields `LN(2)` - fetched rows / total rows = 1/1 - +--------------------+ - | LN(2) | - |--------------------| - | 0.6931471805599453 | - +--------------------+ - - -LOG ---- - -Description ->>>>>>>>>>> - -Specifications: - -Usage: log(x) returns the natural logarithm of x that is the base e logarithm of the x. log(B, x) is equivalent to log(x)/log(B). - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `LOG(2)` = LOG(2), `LOG(2, 8)` = LOG(2, 8) | fields `LOG(2)`, `LOG(2, 8)` - fetched rows / total rows = 1/1 - +--------------------+-----------+ - | LOG(2) | LOG(2, 8) | - |--------------------+-----------| - | 0.6931471805599453 | 3.0 | - +--------------------+-----------+ - - -LOG2 ----- - -Description ->>>>>>>>>>> - -Specifications: - -Usage: log2(x) is equivalent to log(x)/log(2). - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `LOG2(8)` = LOG2(8) | fields `LOG2(8)` - fetched rows / total rows = 1/1 - +---------+ - | LOG2(8) | - |---------| - | 3.0 | - +---------+ - - -LOG10 ------ - -Description ->>>>>>>>>>> - -Specifications: - -Usage: log10(x) is equivalent to log(x)/log(10). 
- -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `LOG10(100)` = LOG10(100) | fields `LOG10(100)` - fetched rows / total rows = 1/1 - +------------+ - | LOG10(100) | - |------------| - | 2.0 | - +------------+ - - -MOD ---- - -Description ->>>>>>>>>>> - -Usage: MOD(n, m) calculates the remainder of the number n divided by m. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: Wider type between types of n and m if m is nonzero value. If m equals to 0, then returns NULL. - -Example:: - - os> source=people | eval `MOD(3, 2)` = MOD(3, 2), `MOD(3.1, 2)` = MOD(3.1, 2) | fields `MOD(3, 2)`, `MOD(3.1, 2)` - fetched rows / total rows = 1/1 - +-----------+-------------+ - | MOD(3, 2) | MOD(3.1, 2) | - |-----------+-------------| - | 1 | 1.1 | - +-----------+-------------+ - - -MODULUS -------- - -Description ->>>>>>>>>>> - -Usage: MODULUS(n, m) calculates the remainder of the number n divided by m. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: Wider type between types of n and m if m is nonzero value. If m equals to 0, then returns NULL. - -Example:: - - os> source=people | eval `MODULUS(3, 2)` = MODULUS(3, 2), `MODULUS(3.1, 2)` = MODULUS(3.1, 2) | fields `MODULUS(3, 2)`, `MODULUS(3.1, 2)` - fetched rows / total rows = 1/1 - +---------------+-----------------+ - | MODULUS(3, 2) | MODULUS(3.1, 2) | - |---------------+-----------------| - | 1 | 1.1 | - +---------------+-----------------+ - - -PI --- - -Description ->>>>>>>>>>> - -Usage: PI() returns the constant pi - -Return type: DOUBLE - -Example:: - - os> source=people | eval `PI()` = PI() | fields `PI()` - fetched rows / total rows = 1/1 - +-------------------+ - | PI() | - |-------------------| - | 3.141592653589793 | - +-------------------+ - - -POW ---- - -Description ->>>>>>>>>>> - -Usage: POW(x, y) calculates the value of x raised to the power of y. Bad inputs return NULL result. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Synonyms: `POWER`_ - -Example:: - - os> source=people | eval `POW(3, 2)` = POW(3, 2), `POW(-3, 2)` = POW(-3, 2), `POW(3, -2)` = POW(3, -2) | fields `POW(3, 2)`, `POW(-3, 2)`, `POW(3, -2)` - fetched rows / total rows = 1/1 - +-----------+------------+--------------------+ - | POW(3, 2) | POW(-3, 2) | POW(3, -2) | - |-----------+------------+--------------------| - | 9.0 | 9.0 | 0.1111111111111111 | - +-----------+------------+--------------------+ - - -POWER ------ - -Description ->>>>>>>>>>> - -Usage: POWER(x, y) calculates the value of x raised to the power of y. Bad inputs return NULL result. - -Argument type: INTEGER/LONG/FLOAT/DOUBLE, INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Synonyms: `POW`_ - -Example:: - - os> source=people | eval `POWER(3, 2)` = POWER(3, 2), `POWER(-3, 2)` = POWER(-3, 2), `POWER(3, -2)` = POWER(3, -2) | fields `POWER(3, 2)`, `POWER(-3, 2)`, `POWER(3, -2)` - fetched rows / total rows = 1/1 - +-------------+--------------+--------------------+ - | POWER(3, 2) | POWER(-3, 2) | POWER(3, -2) | - |-------------+--------------+--------------------| - | 9.0 | 9.0 | 0.1111111111111111 | - +-------------+--------------+--------------------+ - - -RADIANS -------- - -Description ->>>>>>>>>>> - -Usage: radians(x) converts x from degrees to radians. 
- -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: DOUBLE - -Example:: - - os> source=people | eval `RADIANS(90)` = RADIANS(90) | fields `RADIANS(90)` - fetched rows / total rows = 1/1 - +--------------------+ - | RADIANS(90) | - |--------------------| - | 1.5707963267948966 | - +--------------------+ - - -RAND ----- - -Description ->>>>>>>>>>> - -Usage: RAND()/RAND(N) returns a random floating-point value in the range 0 <= value < 1.0. If integer N is specified, the seed is initialized prior to execution. One implication of this behavior is with identical argument N, rand(N) returns the same value each time, and thus produces a repeatable sequence of column values. - -Argument type: INTEGER - -Return type: FLOAT - -Example:: - - os> source=people | eval `RAND(3)` = RAND(3) | fields `RAND(3)` - fetched rows / total rows = 1/1 - +---------------------+ - | RAND(3) | - |---------------------| - | 0.34346429521113886 | - +---------------------+ - - -ROUND ------ - -Description ->>>>>>>>>>> - -Usage: ROUND(x, d) rounds the argument x to d decimal places, d defaults to 0 if not specified - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type map: - -(INTEGER/LONG [,INTEGER]) -> LONG -(FLOAT/DOUBLE [,INTEGER]) -> LONG - -Example:: - - os> source=people | eval `ROUND(12.34)` = ROUND(12.34), `ROUND(12.34, 1)` = ROUND(12.34, 1), `ROUND(12.34, -1)` = ROUND(12.34, -1), `ROUND(12, 1)` = ROUND(12, 1) | fields `ROUND(12.34)`, `ROUND(12.34, 1)`, `ROUND(12.34, -1)`, `ROUND(12, 1)` - fetched rows / total rows = 1/1 - +--------------+-----------------+------------------+--------------+ - | ROUND(12.34) | ROUND(12.34, 1) | ROUND(12.34, -1) | ROUND(12, 1) | - |--------------+-----------------+------------------+--------------| - | 12.0 | 12.3 | 10.0 | 12 | - +--------------+-----------------+------------------+--------------+ - - -SIGN ----- - -Description ->>>>>>>>>>> - -Usage: Returns the sign of the argument as -1, 0, or 1, depending on whether the number is negative, zero, or positive - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: same type with input - -Example:: - - os> source=people | eval `SIGN(1)` = SIGN(1), `SIGN(0)` = SIGN(0), `SIGN(-1.1)` = SIGN(-1.1) | fields `SIGN(1)`, `SIGN(0)`, `SIGN(-1.1)` - fetched rows / total rows = 1/1 - +---------+---------+------------+ - | SIGN(1) | SIGN(0) | SIGN(-1.1) | - |---------+---------+------------| - | 1 | 0 | -1.0 | - +---------+---------+------------+ - - -SIGNUM ------- - -Description ->>>>>>>>>>> - -Usage: Returns the sign of the argument as -1, 0, or 1, depending on whether the number is negative, zero, or positive - -Argument type: INTEGER/LONG/FLOAT/DOUBLE - -Return type: INTEGER - -Synonyms: `SIGN` - -Example:: - - os> source=people | eval `SIGNUM(1)` = SIGNUM(1), `SIGNUM(0)` = SIGNUM(0), `SIGNUM(-1.1)` = SIGNUM(-1.1) | fields `SIGNUM(1)`, `SIGNUM(0)`, `SIGNUM(-1.1)` - fetched rows / total rows = 1/1 - +-----------+-----------+--------------+ - | SIGNUM(1) | SIGNUM(0) | SIGNUM(-1.1) | - |-----------+-----------+--------------| - | 1 | 0 | -1.0 | - +-----------+-----------+--------------+ - - -SIN ---- - -Description ->>>>>>>>>>> - -Usage: sin(x) calculates the sine of x, where x is given in radians. 
-
-Argument type: INTEGER/LONG/FLOAT/DOUBLE
-
-Return type: DOUBLE
-
-Example::
-
-    os> source=people | eval `SIN(0)` = SIN(0) | fields `SIN(0)`
-    fetched rows / total rows = 1/1
-    +--------+
-    | SIN(0) |
-    |--------|
-    | 0.0    |
-    +--------+
-
-
-SINH
-----
-
-Description
->>>>>>>>>>>
-
-Usage: sinh(x) calculates the hyperbolic sine of x, defined as (((e^x) - (e^(-x))) / 2).
-
-Argument type: INTEGER/LONG/FLOAT/DOUBLE
-
-Return type: DOUBLE
-
-Example::
-
-    os> source=people | eval `SINH(2)` = SINH(2) | fields `SINH(2)`
-    fetched rows / total rows = 1/1
-    +-------------------+
-    | SINH(2)           |
-    |-------------------|
-    | 3.626860407847019 |
-    +-------------------+
-
-
-SQRT
-----
-
-Description
->>>>>>>>>>>
-
-Usage: Calculates the square root of a non-negative number
-
-Argument type: INTEGER/LONG/FLOAT/DOUBLE
-
-Return type map:
-
-(Non-negative) INTEGER/LONG/FLOAT/DOUBLE -> DOUBLE
-(Negative) INTEGER/LONG/FLOAT/DOUBLE -> NULL
-
-Example::
-
-    os> source=people | eval `SQRT(4)` = SQRT(4), `SQRT(4.41)` = SQRT(4.41) | fields `SQRT(4)`, `SQRT(4.41)`
-    fetched rows / total rows = 1/1
-    +---------+------------+
-    | SQRT(4) | SQRT(4.41) |
-    |---------+------------|
-    | 2.0     | 2.1        |
-    +---------+------------+
-
-
-CBRT
-----
-
-Description
->>>>>>>>>>>
-
-Usage: Calculates the cube root of a number
-
-Argument type: INTEGER/LONG/FLOAT/DOUBLE
-
-Return type DOUBLE:
-
-INTEGER/LONG/FLOAT/DOUBLE -> DOUBLE
-
-Example::
-
-    opensearchsql> source=location | eval `CBRT(8)` = CBRT(8), `CBRT(9.261)` = CBRT(9.261), `CBRT(-27)` = CBRT(-27) | fields `CBRT(8)`, `CBRT(9.261)`, `CBRT(-27)`;
-    fetched rows / total rows = 2/2
-    +---------+-------------+-----------+
-    | CBRT(8) | CBRT(9.261) | CBRT(-27) |
-    |---------+-------------+-----------|
-    | 2.0     | 2.1         | -3.0      |
-    | 2.0     | 2.1         | -3.0      |
-    +---------+-------------+-----------+
-
-
-RINT
-----
-
-Description
->>>>>>>>>>>
-
-Usage: rint(NUMBER T) returns T rounded to the closest whole integer number.
-
-Argument type: INTEGER/LONG/FLOAT/DOUBLE
-
-Return type: DOUBLE
-
-Example::
-
-    os> source=people | eval `RINT(1.7)` = RINT(1.7) | fields `RINT(1.7)`
-    fetched rows / total rows = 1/1
-    +-----------+
-    | RINT(1.7) |
-    |-----------|
-    | 2.0       |
-    +-----------+
\ No newline at end of file
diff --git a/docs/user/ppl/functions/relevance.md b/docs/user/ppl/functions/relevance.md
new file mode 100644
index 00000000000..2db7caf9a2c
--- /dev/null
+++ b/docs/user/ppl/functions/relevance.md
@@ -0,0 +1,505 @@
+# Relevance Functions
+
+The relevance-based functions enable users to search the index for documents by the relevance of the input query. The functions are built on top of the search queries of the OpenSearch engine, but in-memory execution within the plugin is not supported. These functions can perform the global filtering of a query, for example as the condition expression in a `WHERE` clause or in a `HAVING` clause. For more details of relevance-based search, check out the design here: [Relevance Based Search With SQL/PPL Query Engine](https://github.com/opensearch-project/sql/issues/182)
+
+## MATCH
+
+### Description
+
+`match(field_expression, query_expression[, option=]*)`
+The match function maps to the match query used in the search engine, and returns the documents that match a provided text, number, date, or boolean value in a given field.
+Available parameters include:
+- analyzer
+- auto_generate_synonyms_phrase
+- fuzziness
+- max_expansions
+- prefix_length
+- fuzzy_transpositions
+- fuzzy_rewrite
+- lenient
+- operator
+- minimum_should_match
+- zero_terms_query
+- boost
+
+Example with only the `field` and `query` expressions, with all other parameters set to their default values
+
+```ppl
+source=accounts
+| where match(address, 'Street')
+| fields lastname, address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------+--------------------+
+| lastname | address            |
+|----------+--------------------|
+| Bond     | 671 Bristol Street |
+| Bates    | 789 Madison Street |
++----------+--------------------+
+```
+
+Another example to show how to set custom values for the optional parameters
+
+```ppl
+source=accounts
+| where match(firstname, 'Hattie', operator='AND', boost=2.0)
+| fields lastname
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----------+
+| lastname |
+|----------|
+| Bond     |
++----------+
+```
+
+## MATCH_PHRASE
+
+### Description
+
+`match_phrase(field_expression, query_expression[, option=]*)`
+The match_phrase function maps to the match_phrase query used in the search engine, and returns the documents that match a provided text in a given field. Available parameters include:
+- analyzer
+- slop
+- zero_terms_query
+
+For backward compatibility, matchphrase is also supported and maps to the match_phrase query.
+Example with only the `field` and `query` expressions, with all other parameters set to their default values
+
+```ppl
+source=books
+| where match_phrase(author, 'Alexander Milne')
+| fields author, title
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------------------+--------------------------+
+| author               | title                    |
+|----------------------+--------------------------|
+| Alan Alexander Milne | The House at Pooh Corner |
+| Alan Alexander Milne | Winnie-the-Pooh          |
++----------------------+--------------------------+
+```
+
+Another example to show how to set custom values for the optional parameters
+
+```ppl
+source=books
+| where match_phrase(author, 'Alan Milne', slop = 2)
+| fields author, title
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------------------+--------------------------+
+| author               | title                    |
+|----------------------+--------------------------|
+| Alan Alexander Milne | The House at Pooh Corner |
+| Alan Alexander Milne | Winnie-the-Pooh          |
++----------------------+--------------------------+
+```
+
+## MATCH_PHRASE_PREFIX
+
+### Description
+
+`match_phrase_prefix(field_expression, query_expression[, option=]*)`
+The match_phrase_prefix function maps to the match_phrase_prefix query used in the search engine, and returns the documents that match a provided text in a given field.
+Available parameters include:
+- analyzer
+- slop
+- max_expansions
+- boost
+- zero_terms_query
+
+Example with only the `field` and `query` expressions, with all other parameters set to their default values
+
+```ppl
+source=books
+| where match_phrase_prefix(author, 'Alexander Mil')
+| fields author, title
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------------------+--------------------------+
+| author               | title                    |
+|----------------------+--------------------------|
+| Alan Alexander Milne | The House at Pooh Corner |
+| Alan Alexander Milne | Winnie-the-Pooh          |
++----------------------+--------------------------+
+```
+
+Another example to show how to set custom values for the optional parameters
+
+```ppl
+source=books
+| where match_phrase_prefix(author, 'Alan Mil', slop = 2)
+| fields author, title
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----------------------+--------------------------+
+| author               | title                    |
+|----------------------+--------------------------|
+| Alan Alexander Milne | The House at Pooh Corner |
+| Alan Alexander Milne | Winnie-the-Pooh          |
++----------------------+--------------------------+
+```
+
+## MULTI_MATCH
+
+### Description
+
+`multi_match([field_expression+], query_expression[, option=]*)`
+`multi_match(query_expression[, option=]*)`
+The multi_match function maps to the multi_match query used in the search engine, and returns the documents that match a provided text, number, date, or boolean value in a given field or fields.
+**Two syntax forms are supported:**
+1. **With explicit fields** (classic syntax): `multi_match([field_list], query, ...)`
+2. **Without fields** (search default fields): `multi_match(query, ...)`
+
+When fields are omitted, the query searches in the fields specified by the `index.query.default_field` setting.
+The **^** character lets you *boost* certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. The syntax allows specifying the fields in double quotes, single quotes, backticks, or without any wrapping. Searching all fields with the star `"*"` is also available (the star symbol should be wrapped). The weight is optional and is specified after the field name; it can be delimited by the caret character or by whitespace. Please refer to the examples below:
+``multi_match(["Tags" ^ 2, 'Title' 3.4, `Body`, Comments ^ 0.3], ...)``
+`multi_match(["*"], ...)`
+`multi_match("search text", ...)` (searches default fields)
+Available parameters include:
+- analyzer
+- auto_generate_synonyms_phrase
+- cutoff_frequency
+- fuzziness
+- fuzzy_transpositions
+- lenient
+- max_expansions
+- minimum_should_match
+- operator
+- prefix_length
+- tie_breaker
+- type
+- slop
+- boost
+
+Example with only the `fields` and `query` expressions, with all other parameters set to their default values
+
+```ppl
+source=books
+| where multi_match(['title'], 'Pooh House')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
+| 2  | Winnie-the-Pooh          | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+Another example to show how to set custom values for the optional parameters
+
+```ppl
+source=books
+| where multi_match(['title'], 'Pooh House', operator='AND', analyzer=default)
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+Example using the new syntax without specifying fields (searches in index.query.default_field)
+
+```ppl
+source=books
+| where multi_match('Pooh House')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
+| 2  | Winnie-the-Pooh          | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
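+
+To make the boost syntax concrete, here is a hypothetical sketch (illustrative only, hence the `ignore` marker) that weighs matches in `title` twice as heavily as matches in `author`:
+
+```ppl ignore
+source=books
+| where multi_match(['title' ^ 2, 'author'], 'Pooh')
+| fields id, title, author
+```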
+
+## SIMPLE_QUERY_STRING
+
+### Description
+
+`simple_query_string([field_expression+], query_expression[, option=]*)`
+`simple_query_string(query_expression[, option=]*)`
+The simple_query_string function maps to the simple_query_string query used in the search engine, and returns the documents that match a provided text, number, date, or boolean value in a given field or fields.
+**Two syntax forms are supported:**
+1. **With explicit fields** (classic syntax): `simple_query_string([field_list], query, ...)`
+2. **Without fields** (search default fields): `simple_query_string(query, ...)`
+
+When fields are omitted, the query searches in the fields specified by the `index.query.default_field` setting.
+The **^** character lets you *boost* certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. The syntax allows specifying the fields in double quotes, single quotes, backticks, or without any wrapping. Searching all fields with the star `"*"` is also available (the star symbol should be wrapped). The weight is optional and is specified after the field name; it can be delimited by the caret character or by whitespace.
+Please refer to the examples below:
+``simple_query_string(["Tags" ^ 2, 'Title' 3.4, `Body`, Comments ^ 0.3], ...)``
+`simple_query_string(["*"], ...)`
+`simple_query_string("search text", ...)` (searches default fields)
+Available parameters include:
+- analyze_wildcard
+- analyzer
+- auto_generate_synonyms_phrase
+- flags
+- fuzziness
+- fuzzy_max_expansions
+- fuzzy_prefix_length
+- fuzzy_transpositions
+- lenient
+- default_operator
+- minimum_should_match
+- quote_field_suffix
+- boost
+
+Example with only the `fields` and `query` expressions, with all other parameters set to their default values
+
+```ppl
+source=books
+| where simple_query_string(['title'], 'Pooh House')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
+| 2  | Winnie-the-Pooh          | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+Another example to show how to set custom values for the optional parameters
+
+```ppl
+source=books
+| where simple_query_string(['title'], 'Pooh House', flags='ALL', default_operator='AND')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+Example using the new syntax without specifying fields (searches in index.query.default_field)
+
+```ppl
+source=books
+| where simple_query_string('Pooh House')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
+| 2  | Winnie-the-Pooh          | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+## MATCH_BOOL_PREFIX
+
+### Description
+
+`match_bool_prefix(field_expression, query_expression)`
+The match_bool_prefix function maps to the match_bool_prefix query in the search engine. match_bool_prefix creates a match query from all but the last term in the query string. The last term is used to create a prefix query. Available parameters include:
+- analyzer
+- fuzziness
+- max_expansions
+- prefix_length
+- fuzzy_transpositions
+- operator
+- fuzzy_rewrite
+- minimum_should_match
+- boost
+
+Example with only the `field` and `query` expressions, with all other parameters set to their default values
+
+```ppl
+source=accounts
+| where match_bool_prefix(address, 'Bristol Stre')
+| fields firstname, address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++-----------+--------------------+
+| firstname | address            |
+|-----------+--------------------|
+| Hattie    | 671 Bristol Street |
+| Nanette   | 789 Madison Street |
++-----------+--------------------+
+```
+
+Another example to show how to set custom values for the optional parameters
+
+```ppl
+source=accounts
+| where match_bool_prefix(address, 'Bristol Stre', minimum_should_match = 2)
+| fields firstname, address
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++-----------+--------------------+
+| firstname | address            |
+|-----------+--------------------|
+| Hattie    | 671 Bristol Street |
++-----------+--------------------+
+```
+
+## QUERY_STRING
+
+### Description
+
+`query_string([field_expression+], query_expression[, option=]*)`
+`query_string(query_expression[, option=]*)`
+The query_string function maps to the query_string query used in the search engine, and returns the documents that match a provided text, number, date, or boolean value in a given field or fields.
+**Two syntax forms are supported:**
+1. **With explicit fields** (classic syntax): `query_string([field_list], query, ...)`
+2. **Without fields** (search default fields): `query_string(query, ...)`
+
+When fields are omitted, the query searches in the fields specified by the `index.query.default_field` setting.
+The **^** character lets you *boost* certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. The syntax allows specifying the fields in double quotes, single quotes, backticks, or without any wrapping. Searching all fields with the star `"*"` is also available (the star symbol should be wrapped). The weight is optional and is specified after the field name; it can be delimited by the caret character or by whitespace.
+Please refer to the examples below:
+``query_string(["Tags" ^ 2, 'Title' 3.4, `Body`, Comments ^ 0.3], ...)``
+`query_string(["*"], ...)`
+`query_string("search text", ...)` (searches default fields)
+Available parameters include:
+- analyzer
+- escape
+- allow_leading_wildcard
+- analyze_wildcard
+- auto_generate_synonyms_phrase_query
+- boost
+- default_operator
+- enable_position_increments
+- fuzziness
+- fuzzy_max_expansions
+- fuzzy_prefix_length
+- fuzzy_transpositions
+- fuzzy_rewrite
+- tie_breaker
+- lenient
+- type
+- max_determinized_states
+- minimum_should_match
+- quote_analyzer
+- phrase_slop
+- quote_field_suffix
+- rewrite
+- time_zone
+
+Example with only the `fields` and `query` expressions, with all other parameters set to their default values
+
+```ppl
+source=books
+| where query_string(['title'], 'Pooh House')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
+| 2  | Winnie-the-Pooh          | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+Another example to show how to set custom values for the optional parameters
+
+```ppl
+source=books
+| where query_string(['title'], 'Pooh House', default_operator='AND')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+Example using the new syntax without specifying fields (searches in index.query.default_field)
+
+```ppl
+source=books
+| where query_string('Pooh House')
+| fields id, title, author
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++----+--------------------------+----------------------+
+| id | title                    | author               |
+|----+--------------------------+----------------------|
+| 1  | The House at Pooh Corner | Alan Alexander Milne |
+| 2  | Winnie-the-Pooh          | Alan Alexander Milne |
++----+--------------------------+----------------------+
+```
+
+### Limitations
+
+The relevance functions currently execute only in OpenSearch DSL, not in memory, so a relevance search might fail when the relevance function follows a PPL query that is too complex to translate into DSL. To keep your queries workable, it is recommended to place the relevance commands as close to the search command as possible, which ensures that the relevance functions are eligible for push-down. For example, a complex query like `search source = people | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | where match(employer, 'Open Search') | stats count() by city` could fail because it is difficult to translate to DSL. It is better to rewrite it as the equivalent query `search source = people | where match(employer, 'Open Search') | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | stats count() by city`, moving the where command with the relevance function right after the search command, so that the relevance function can be pushed down and executed smoothly in OpenSearch DSL.
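+
+For readability, the recommended rewrite from the example above is shown below as a multi-line pipeline (line breaks only; the semantics are identical):
+
+```ppl ignore
+search source = people
+| where match(employer, 'Open Search')
+| rename firstname as name
+| dedup account_number
+| fields name, account_number, balance, employer
+| stats count() by city
+```
+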
See [Optimization](../../optimization/optimization.md) to get more details about the query engine optimization. \ No newline at end of file diff --git a/docs/user/ppl/functions/relevance.rst b/docs/user/ppl/functions/relevance.rst deleted file mode 100644 index 3f30586c730..00000000000 --- a/docs/user/ppl/functions/relevance.rst +++ /dev/null @@ -1,424 +0,0 @@ -=================== -Relevance Functions -=================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -The relevance based functions enable users to search the index for documents by the relevance of the input query. The functions are built on the top of the search queries of the OpenSearch engine, but in memory execution within the plugin is not supported. These functions are able to perform the global filter of a query, for example the condition expression in a ``WHERE`` clause or in a ``HAVING`` clause. For more details of the relevance based search, check out the design here: `Relevance Based Search With SQL/PPL Query Engine `_ - -MATCH ------ - -Description ->>>>>>>>>>> - -``match(field_expression, query_expression[, option=]*)`` - -The match function maps to the match query used in search engine, to return the documents that match a provided text, number, date or boolean value with a given field. Available parameters include: - -- analyzer -- auto_generate_synonyms_phrase -- fuzziness -- max_expansions -- prefix_length -- fuzzy_transpositions -- fuzzy_rewrite -- lenient -- operator -- minimum_should_match -- zero_terms_query -- boost - -Example with only ``field`` and ``query`` expressions, and all other parameters are set default values:: - - os> source=accounts | where match(address, 'Street') | fields lastname, address; - fetched rows / total rows = 2/2 - +----------+--------------------+ - | lastname | address | - |----------+--------------------| - | Bond | 671 Bristol Street | - | Bates | 789 Madison Street | - +----------+--------------------+ - - - -Another example to show how to set custom values for the optional parameters:: - - os> source=accounts | where match(firstname, 'Hattie', operator='AND', boost=2.0) | fields lastname; - fetched rows / total rows = 1/1 - +----------+ - | lastname | - |----------| - | Bond | - +----------+ - - -MATCH_PHRASE ------------- - -Description ->>>>>>>>>>> - -``match_phrase(field_expression, query_expression[, option=]*)`` - -The match_phrase function maps to the match_phrase query used in search engine, to return the documents that match a provided text with a given field. Available parameters include: - -- analyzer -- slop -- zero_terms_query - -For backward compatibility, matchphrase is also supported and mapped to match_phrase query as well. 
- -Example with only ``field`` and ``query`` expressions, and all other parameters are set default values:: - - os> source=books | where match_phrase(author, 'Alexander Milne') | fields author, title - fetched rows / total rows = 2/2 - +----------------------+--------------------------+ - | author | title | - |----------------------+--------------------------| - | Alan Alexander Milne | The House at Pooh Corner | - | Alan Alexander Milne | Winnie-the-Pooh | - +----------------------+--------------------------+ - - - -Another example to show how to set custom values for the optional parameters:: - - os> source=books | where match_phrase(author, 'Alan Milne', slop = 2) | fields author, title - fetched rows / total rows = 2/2 - +----------------------+--------------------------+ - | author | title | - |----------------------+--------------------------| - | Alan Alexander Milne | The House at Pooh Corner | - | Alan Alexander Milne | Winnie-the-Pooh | - +----------------------+--------------------------+ - - -MATCH_PHRASE_PREFIX -------------------- - -Description ->>>>>>>>>>> - -``match_phrase_prefix(field_expression, query_expression[, option=]*)`` - -The match_phrase_prefix function maps to the match_phrase_prefix query used in search engine, to return the documents that match a provided text with a given field. Available parameters include: - -- analyzer -- slop -- max_expansions -- boost -- zero_terms_query - -Example with only ``field`` and ``query`` expressions, and all other parameters are set default values:: - - os> source=books | where match_phrase_prefix(author, 'Alexander Mil') | fields author, title - fetched rows / total rows = 2/2 - +----------------------+--------------------------+ - | author | title | - |----------------------+--------------------------| - | Alan Alexander Milne | The House at Pooh Corner | - | Alan Alexander Milne | Winnie-the-Pooh | - +----------------------+--------------------------+ - - - -Another example to show how to set custom values for the optional parameters:: - - os> source=books | where match_phrase_prefix(author, 'Alan Mil', slop = 2) | fields author, title - fetched rows / total rows = 2/2 - +----------------------+--------------------------+ - | author | title | - |----------------------+--------------------------| - | Alan Alexander Milne | The House at Pooh Corner | - | Alan Alexander Milne | Winnie-the-Pooh | - +----------------------+--------------------------+ - - -MULTI_MATCH ------------ - -Description ->>>>>>>>>>> - -``multi_match([field_expression+], query_expression[, option=]*)`` - -``multi_match(query_expression[, option=]*)`` - -The multi_match function maps to the multi_match query used in search engine, to return the documents that match a provided text, number, date or boolean value with a given field or fields. - -**Two syntax forms are supported:** - -1. **With explicit fields** (classic syntax): ``multi_match([field_list], query, ...)`` -2. **Without fields** (search default fields): ``multi_match(query, ...)`` - -When fields are omitted, the query searches in the fields specified by the ``index.query.default_field`` setting. - -The **^** lets you *boost* certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. The syntax allows to specify the fields in double quotes, single quotes, in backtick or even without any wrap. All fields search using star ``"*"`` is also available (star symbol should be wrapped). 
The weight is optional and should be specified using after the field name, it could be delimeted by the `caret` character or by whitespace. Please, refer to examples below: - -| ``multi_match(["Tags" ^ 2, 'Title' 3.4, `Body`, Comments ^ 0.3], ...)`` -| ``multi_match(["*"], ...)`` -| ``multi_match("search text", ...)`` (searches default fields) - - -Available parameters include: - -- analyzer -- auto_generate_synonyms_phrase -- cutoff_frequency -- fuzziness -- fuzzy_transpositions -- lenient -- max_expansions -- minimum_should_match -- operator -- prefix_length -- tie_breaker -- type -- slop -- boost - -Example with only ``fields`` and ``query`` expressions, and all other parameters are set default values:: - - os> source=books | where multi_match(['title'], 'Pooh House') | fields id, title, author; - fetched rows / total rows = 2/2 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - | 2 | Winnie-the-Pooh | Alan Alexander Milne | - +----+--------------------------+----------------------+ - -Another example to show how to set custom values for the optional parameters:: - - os> source=books | where multi_match(['title'], 'Pooh House', operator='AND', analyzer=default) | fields id, title, author; - fetched rows / total rows = 1/1 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - +----+--------------------------+----------------------+ - -Example using the new syntax without specifying fields (searches in index.query.default_field):: - - os> source=books | where multi_match('Pooh House') | fields id, title, author; - fetched rows / total rows = 2/2 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - | 2 | Winnie-the-Pooh | Alan Alexander Milne | - +----+--------------------------+----------------------+ - - -SIMPLE_QUERY_STRING -------------------- - -Description ->>>>>>>>>>> - -``simple_query_string([field_expression+], query_expression[, option=]*)`` - -``simple_query_string(query_expression[, option=]*)`` - -The simple_query_string function maps to the simple_query_string query used in search engine, to return the documents that match a provided text, number, date or boolean value with a given field or fields. - -**Two syntax forms are supported:** - -1. **With explicit fields** (classic syntax): ``simple_query_string([field_list], query, ...)`` -2. **Without fields** (search default fields): ``simple_query_string(query, ...)`` - -When fields are omitted, the query searches in the fields specified by the ``index.query.default_field`` setting. - -The **^** lets you *boost* certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. The syntax allows to specify the fields in double quotes, single quotes, in backtick or even without any wrap. All fields search using star ``"*"`` is also available (star symbol should be wrapped). The weight is optional and should be specified using after the field name, it could be delimeted by the `caret` character or by whitespace. 
Please, refer to examples below: - -| ``simple_query_string(["Tags" ^ 2, 'Title' 3.4, `Body`, Comments ^ 0.3], ...)`` -| ``simple_query_string(["*"], ...)`` -| ``simple_query_string("search text", ...)`` (searches default fields) - - -Available parameters include: - -- analyze_wildcard -- analyzer -- auto_generate_synonyms_phrase -- flags -- fuzziness -- fuzzy_max_expansions -- fuzzy_prefix_length -- fuzzy_transpositions -- lenient -- default_operator -- minimum_should_match -- quote_field_suffix -- boost - -Example with only ``fields`` and ``query`` expressions, and all other parameters are set default values:: - - os> source=books | where simple_query_string(['title'], 'Pooh House') | fields id, title, author; - fetched rows / total rows = 2/2 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - | 2 | Winnie-the-Pooh | Alan Alexander Milne | - +----+--------------------------+----------------------+ - -Another example to show how to set custom values for the optional parameters:: - - os> source=books | where simple_query_string(['title'], 'Pooh House', flags='ALL', default_operator='AND') | fields id, title, author; - fetched rows / total rows = 1/1 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - +----+--------------------------+----------------------+ - -Example using the new syntax without specifying fields (searches in index.query.default_field):: - - os> source=books | where simple_query_string('Pooh House') | fields id, title, author; - fetched rows / total rows = 2/2 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - | 2 | Winnie-the-Pooh | Alan Alexander Milne | - +----+--------------------------+----------------------+ - - -MATCH_BOOL_PREFIX ------------------ - -Description ->>>>>>>>>>> - -``match_bool_prefix(field_expression, query_expression)`` - -The match_bool_prefix function maps to the match_bool_prefix query in the search engine. match_bool_prefix creates a match query from all but the last term in the query string. The last term is used to create a prefix query. 
- -- analyzer -- fuzziness -- max_expansions -- prefix_length -- fuzzy_transpositions -- operator -- fuzzy_rewrite -- minimum_should_match -- boost - -Example with only ``field`` and ``query`` expressions, and all other parameters are set default values:: - - os> source=accounts | where match_bool_prefix(address, 'Bristol Stre') | fields firstname, address - fetched rows / total rows = 2/2 - +-----------+--------------------+ - | firstname | address | - |-----------+--------------------| - | Hattie | 671 Bristol Street | - | Nanette | 789 Madison Street | - +-----------+--------------------+ - -Another example to show how to set custom values for the optional parameters:: - - os> source=accounts | where match_bool_prefix(address, 'Bristol Stre', minimum_should_match = 2) | fields firstname, address - fetched rows / total rows = 1/1 - +-----------+--------------------+ - | firstname | address | - |-----------+--------------------| - | Hattie | 671 Bristol Street | - +-----------+--------------------+ - - -QUERY_STRING ------------- - -Description ->>>>>>>>>>> - -``query_string([field_expression+], query_expression[, option=]*)`` - -``query_string(query_expression[, option=]*)`` - -The query_string function maps to the query_string query used in search engine, to return the documents that match a provided text, number, date or boolean value with a given field or fields. - -**Two syntax forms are supported:** - -1. **With explicit fields** (classic syntax): ``query_string([field_list], query, ...)`` -2. **Without fields** (search default fields): ``query_string(query, ...)`` - -When fields are omitted, the query searches in the fields specified by the ``index.query.default_field`` setting. - -The **^** lets you *boost* certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. The syntax allows to specify the fields in double quotes, -single quotes, in backtick or even without any wrap. All fields search using star ``"*"`` is also available (star symbol should be wrapped). The weight is optional and should be specified using after the field name, -it could be delimeted by the `caret` character or by whitespace. 
Please, refer to examples below: - -| ``query_string(["Tags" ^ 2, 'Title' 3.4, `Body`, Comments ^ 0.3], ...)`` -| ``query_string(["*"], ...)`` -| ``query_string("search text", ...)`` (searches default fields) - - -Available parameters include: - -- analyzer -- escape -- allow_leading_wildcard -- analyze_wildcard -- auto_generate_synonyms_phrase_query -- boost -- default_operator -- enable_position_increments -- fuzziness -- fuzzy_max_expansions -- fuzzy_prefix_length -- fuzzy_transpositions -- fuzzy_rewrite -- tie_breaker -- lenient -- type -- max_determinized_states -- minimum_should_match -- quote_analyzer -- phrase_slop -- quote_field_suffix -- rewrite -- time_zone - -Example with only ``fields`` and ``query`` expressions, and all other parameters are set default values:: - - os> source=books | where query_string(['title'], 'Pooh House') | fields id, title, author; - fetched rows / total rows = 2/2 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - | 2 | Winnie-the-Pooh | Alan Alexander Milne | - +----+--------------------------+----------------------+ - -Another example to show how to set custom values for the optional parameters:: - - os> source=books | where query_string(['title'], 'Pooh House', default_operator='AND') | fields id, title, author; - fetched rows / total rows = 1/1 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - +----+--------------------------+----------------------+ - -Example using the new syntax without specifying fields (searches in index.query.default_field):: - - os> source=books | where query_string('Pooh House') | fields id, title, author; - fetched rows / total rows = 2/2 - +----+--------------------------+----------------------+ - | id | title | author | - |----+--------------------------+----------------------| - | 1 | The House at Pooh Corner | Alan Alexander Milne | - | 2 | Winnie-the-Pooh | Alan Alexander Milne | - +----+--------------------------+----------------------+ - -Limitations ->>>>>>>>>>> - -The relevance functions are available to execute only in OpenSearch DSL but not in memory as of now, so the relevance search might fail for queries that are too complex to translate into DSL if the relevance function is following after a complex PPL query. To make your queries always work-able, it is recommended to place the relevance commands as close to the search command as possible, to ensure the relevance functions are eligible to push down. For example, a complex query like ``search source = people | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | where match(employer, 'Open Search') | stats count() by city`` could fail because it is difficult to translate to DSL, but it would be better if we rewrite it to an equivalent query as ``search source = people | where match(employer, 'Open Search') | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | stats count() by city`` by moving the where command with relevance function to the second command right after the search command, and the relevance would be optimized and executed smoothly in OpenSearch DSL. 
See `Optimization <../../optimization/optimization.rst>`_ to get more details about the query engine optimization. diff --git a/docs/user/ppl/functions/statistical.md b/docs/user/ppl/functions/statistical.md new file mode 100644 index 00000000000..b1098566916 --- /dev/null +++ b/docs/user/ppl/functions/statistical.md @@ -0,0 +1,142 @@ +# Statistical Functions + +## MAX + +### Description + +Usage: max(x, y, ...) returns the maximum value from all provided arguments. Strings are treated as greater than numbers, so if both strings and numbers are provided, it will return the maximum string value (lexicographically ordered). +Note: This function is only available in the eval command context. +Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE/STRING arguments +Return type: Type of the selected argument +Example + +```ppl +source=accounts +| eval max_val = MAX(age, 30) +| fields age, max_val +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+---------+ +| age | max_val | +|-----+---------| +| 32 | 32 | +| 36 | 36 | +| 28 | 30 | +| 33 | 33 | ++-----+---------+ +``` + +```ppl +source=accounts +| eval result = MAX(firstname, 'John') +| fields firstname, result +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+---------+ +| firstname | result | +|-----------+---------| +| Amber | John | +| Hattie | John | +| Nanette | Nanette | +| Dale | John | ++-----------+---------+ +``` + +```ppl +source=accounts +| eval result = MAX(age, 35, 'John', firstname) +| fields age, firstname, result +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+-----------+---------+ +| age | firstname | result | +|-----+-----------+---------| +| 32 | Amber | John | +| 36 | Hattie | John | +| 28 | Nanette | Nanette | +| 33 | Dale | John | ++-----+-----------+---------+ +``` + +## MIN + +### Description + +Usage: min(x, y, ...) returns the minimum value from all provided arguments. Strings are treated as greater than numbers, so if both strings and numbers are provided, it will return the minimum numeric value. +Note: This function is only available in the eval command context.
+Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE/STRING arguments +Return type: Type of the selected argument +Example + +```ppl +source=accounts +| eval min_val = MIN(age, 30) +| fields age, min_val +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+---------+ +| age | min_val | +|-----+---------| +| 32 | 30 | +| 36 | 30 | +| 28 | 28 | +| 33 | 30 | ++-----+---------+ +``` + +```ppl +source=accounts +| eval result = MIN(firstname, 'John') +| fields firstname, result +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----------+--------+ +| firstname | result | +|-----------+--------| +| Amber | Amber | +| Hattie | Hattie | +| Nanette | John | +| Dale | Dale | ++-----------+--------+ +``` + +```ppl +source=accounts +| eval result = MIN(age, 35, firstname) +| fields age, firstname, result +``` + +Expected output: + +```text +fetched rows / total rows = 4/4 ++-----+-----------+--------+ +| age | firstname | result | +|-----+-----------+--------| +| 32 | Amber | 32 | +| 36 | Hattie | 35 | +| 28 | Nanette | 28 | +| 33 | Dale | 33 | ++-----+-----------+--------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/statistical.rst b/docs/user/ppl/functions/statistical.rst deleted file mode 100644 index 3729c1991ca..00000000000 --- a/docs/user/ppl/functions/statistical.rst +++ /dev/null @@ -1,109 +0,0 @@ -====================== -Statistical Functions -====================== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - - -MAX ---- - -Description ->>>>>>>>>>> - -Usage: max(x, y, ...) returns the maximum value from all provided arguments. Strings are treated as greater than numbers, so if provided both strings and numbers, it will return the maximum string value (lexicographically ordered) - -Note: This function is only available in the eval command context. - -Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE/STRING arguments - -Return type: Type of the selected argument - -Example:: - - os> source=accounts | eval max_val = MAX(age, 30) | fields age, max_val - fetched rows / total rows = 4/4 - +-----+---------+ - | age | max_val | - |-----+---------| - | 32 | 32 | - | 36 | 36 | - | 28 | 30 | - | 33 | 33 | - +-----+---------+ - - os> source=accounts | eval result = MAX(firstname, 'John') | fields firstname, result - fetched rows / total rows = 4/4 - +-----------+---------+ - | firstname | result | - |-----------+---------| - | Amber | John | - | Hattie | John | - | Nanette | Nanette | - | Dale | John | - +-----------+---------+ - - os> source=accounts | eval result = MAX(age, 35, 'John', firstname) | fields age, firstname, result - fetched rows / total rows = 4/4 - +-----+-----------+---------+ - | age | firstname | result | - |-----+-----------+---------| - | 32 | Amber | John | - | 36 | Hattie | John | - | 28 | Nanette | Nanette | - | 33 | Dale | John | - +-----+-----------+---------+ - - -MIN ---- - -Description ->>>>>>>>>>> - -Usage: min(x, y, ...) returns the minimum value from all provided arguments. Strings are treated as greater than numbers, so if provided both strings and numbers, it will return the minimum numeric value. - -Note: This function is only available in the eval command context. 
- -Argument type: Variable number of INTEGER/LONG/FLOAT/DOUBLE/STRING arguments - -Return type: Type of the selected argument - -Example:: - - os> source=accounts | eval min_val = MIN(age, 30) | fields age, min_val - fetched rows / total rows = 4/4 - +-----+---------+ - | age | min_val | - |-----+---------| - | 32 | 30 | - | 36 | 30 | - | 28 | 28 | - | 33 | 30 | - +-----+---------+ - - os> source=accounts | eval result = MIN(firstname, 'John') | fields firstname, result - fetched rows / total rows = 4/4 - +-----------+--------+ - | firstname | result | - |-----------+--------| - | Amber | Amber | - | Hattie | Hattie | - | Nanette | John | - | Dale | Dale | - +-----------+--------+ - - os> source=accounts | eval result = MIN(age, 35, firstname) | fields age, firstname, result - fetched rows / total rows = 4/4 - +-----+-----------+--------+ - | age | firstname | result | - |-----+-----------+--------| - | 32 | Amber | 32 | - | 36 | Hattie | 35 | - | 28 | Nanette | 28 | - | 33 | Dale | 33 | - +-----+-----------+--------+ \ No newline at end of file diff --git a/docs/user/ppl/functions/string.md b/docs/user/ppl/functions/string.md new file mode 100644 index 00000000000..04a3485c492 --- /dev/null +++ b/docs/user/ppl/functions/string.md @@ -0,0 +1,549 @@ +# String Functions + +## CONCAT + +### Description + +Usage: CONCAT(str1, str2, ..., str_9) adds up to 9 strings together. +Argument type: STRING, STRING, ..., STRING +Return type: STRING +Example + +```ppl +source=people +| eval `CONCAT('hello', 'world')` = CONCAT('hello', 'world'), `CONCAT('hello ', 'whole ', 'world', '!')` = CONCAT('hello ', 'whole ', 'world', '!') +| fields `CONCAT('hello', 'world')`, `CONCAT('hello ', 'whole ', 'world', '!')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------+------------------------------------------+ +| CONCAT('hello', 'world') | CONCAT('hello ', 'whole ', 'world', '!') | +|--------------------------+------------------------------------------| +| helloworld | hello whole world! | ++--------------------------+------------------------------------------+ +``` + +## CONCAT_WS + +### Description + +Usage: CONCAT_WS(sep, str1, str2) returns str1 concatenated with str2 using sep as a separator between them. +Argument type: STRING, STRING, STRING +Return type: STRING +Example + +```ppl +source=people +| eval `CONCAT_WS(',', 'hello', 'world')` = CONCAT_WS(',', 'hello', 'world') +| fields `CONCAT_WS(',', 'hello', 'world')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------------+ +| CONCAT_WS(',', 'hello', 'world') | +|----------------------------------| +| hello,world | ++----------------------------------+ +``` + +## LENGTH + +### Description + +Specifications: +1. LENGTH(STRING) -> INTEGER + +Usage: length(str) returns the length of the string measured in bytes. +Argument type: STRING +Return type: INTEGER +Example + +```ppl +source=people +| eval `LENGTH('helloworld')` = LENGTH('helloworld') +| fields `LENGTH('helloworld')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------+ +| LENGTH('helloworld') | +|----------------------| +| 10 | ++----------------------+ +``` + +## LIKE + +### Description + +Usage: like(string, PATTERN[, case_sensitive]) returns true if the string matches the PATTERN. `case_sensitive` is optional. When set to `true`, PATTERN is **case-sensitive**. **Default:** Determined by `plugins.ppl.syntax.legacy.preferred`.
* When `plugins.ppl.syntax.legacy.preferred=true`, `case_sensitive` defaults to `false` + * When `plugins.ppl.syntax.legacy.preferred=false`, `case_sensitive` defaults to `true` + +There are two wildcards often used in conjunction with the LIKE operator: +* `%` - The percent sign represents zero, one, or multiple characters +* `_` - The underscore represents a single character + +Argument type: STRING, STRING [, BOOLEAN] +Return type: INTEGER +Example + +```ppl +source=people +| eval `LIKE('hello world', '_ello%')` = LIKE('hello world', '_ello%'), `LIKE('hello world', '_ELLo%', true)` = LIKE('hello world', '_ELLo%', true), `LIKE('hello world', '_ELLo%', false)` = LIKE('hello world', '_ELLo%', false) +| fields `LIKE('hello world', '_ello%')`, `LIKE('hello world', '_ELLo%', true)`, `LIKE('hello world', '_ELLo%', false)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------------+-------------------------------------+--------------------------------------+ +| LIKE('hello world', '_ello%') | LIKE('hello world', '_ELLo%', true) | LIKE('hello world', '_ELLo%', false) | +|-------------------------------+-------------------------------------+--------------------------------------| +| True | False | True | ++-------------------------------+-------------------------------------+--------------------------------------+ +``` + +Limitation: The pushdown of the LIKE function to a DSL wildcard query is supported only for keyword fields. + +## ILIKE + +### Description + +Usage: ilike(string, PATTERN) returns true if the string matches the PATTERN; PATTERN is **case-insensitive**. +There are two wildcards often used in conjunction with the ILIKE operator: +* `%` - The percent sign represents zero, one, or multiple characters +* `_` - The underscore represents a single character + +Argument type: STRING, STRING +Return type: INTEGER +Example + +```ppl +source=people +| eval `ILIKE('hello world', '_ELLo%')` = ILIKE('hello world', '_ELLo%') +| fields `ILIKE('hello world', '_ELLo%')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------+ +| ILIKE('hello world', '_ELLo%') | +|--------------------------------| +| True | ++--------------------------------+ +``` + +Limitation: The pushdown of the ILIKE function to a DSL wildcard query is supported only for keyword fields. + +## LOCATE + +### Description + +Usage: locate(substr, str[, start]) returns the position of the first occurrence of substring substr in string str, starting the search from position start. If start is not specified, it defaults to 1 (the beginning of the string). Returns 0 if substr is not found. If any argument is NULL, the function returns NULL.
+Argument type: STRING, STRING[, INTEGER] +Return type: INTEGER +Example + +```ppl +source=people +| eval `LOCATE('world', 'helloworld')` = LOCATE('world', 'helloworld'), `LOCATE('invalid', 'helloworld')` = LOCATE('invalid', 'helloworld'), `LOCATE('world', 'helloworld', 6)` = LOCATE('world', 'helloworld', 6) +| fields `LOCATE('world', 'helloworld')`, `LOCATE('invalid', 'helloworld')`, `LOCATE('world', 'helloworld', 6)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------------------+---------------------------------+----------------------------------+ +| LOCATE('world', 'helloworld') | LOCATE('invalid', 'helloworld') | LOCATE('world', 'helloworld', 6) | +|-------------------------------+---------------------------------+----------------------------------| +| 6 | 0 | 6 | ++-------------------------------+---------------------------------+----------------------------------+ +``` + +## LOWER + +### Description + +Usage: lower(string) converts the string to lowercase. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `LOWER('helloworld')` = LOWER('helloworld'), `LOWER('HELLOWORLD')` = LOWER('HELLOWORLD') +| fields `LOWER('helloworld')`, `LOWER('HELLOWORLD')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------+---------------------+ +| LOWER('helloworld') | LOWER('HELLOWORLD') | +|---------------------+---------------------| +| helloworld | helloworld | ++---------------------+---------------------+ +``` + +## LTRIM + +### Description + +Usage: ltrim(str) trims leading space characters from the string. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `LTRIM(' hello')` = LTRIM(' hello'), `LTRIM('hello ')` = LTRIM('hello ') +| fields `LTRIM(' hello')`, `LTRIM('hello ')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------+-------------------+ +| LTRIM(' hello') | LTRIM('hello ') | +|-------------------+-------------------| +| hello | hello | ++-------------------+-------------------+ +``` + +## POSITION + +### Description + +Usage: The syntax POSITION(substr IN str) returns the position of the first occurrence of substring substr in string str. Returns 0 if substr is not in str. Returns NULL if any argument is NULL. +Argument type: STRING, STRING +Return type: INTEGER +(STRING IN STRING) -> INTEGER +Example + +```ppl +source=people +| eval `POSITION('world' IN 'helloworld')` = POSITION('world' IN 'helloworld'), `POSITION('invalid' IN 'helloworld')`= POSITION('invalid' IN 'helloworld') +| fields `POSITION('world' IN 'helloworld')`, `POSITION('invalid' IN 'helloworld')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------------------+-------------------------------------+ +| POSITION('world' IN 'helloworld') | POSITION('invalid' IN 'helloworld') | +|-----------------------------------+-------------------------------------| +| 6 | 0 | ++-----------------------------------+-------------------------------------+ +``` + +## REPLACE + +### Description + +Usage: replace(str, pattern, replacement) returns a string with all occurrences of the pattern replaced by the replacement string in str. If any argument is NULL, the function returns NULL.
+**Regular Expression Support**: The pattern argument supports Java regex syntax, including character classes, quantifiers, capture groups, and backreferences (see the examples below). +Argument type: STRING, STRING (regex pattern), STRING (replacement) +Return type: STRING +**Important - Regex Special Characters**: The pattern is interpreted as a regular expression. Characters like `.`, `*`, `+`, `[`, `]`, `(`, `)`, `{`, `}`, `^`, `$`, `|`, `?`, and `\` have special meaning in regex. To match them literally, escape with backslashes: +* To match `example.com`: use `'example\\.com'` (escape the dots) +* To match `value*`: use `'value\\*'` (escape the asterisk) +* To match `price+tax`: use `'price\\+tax'` (escape the plus) + +For strings with many special characters, use `\\Q...\\E` to quote the entire literal string (e.g., `'\\Qhttps://example.com/path?id=123\\E'` matches that exact URL). +Literal String Replacement Examples + +```ppl +source=people +| eval `REPLACE('helloworld', 'world', 'universe')` = REPLACE('helloworld', 'world', 'universe'), `REPLACE('helloworld', 'invalid', 'universe')` = REPLACE('helloworld', 'invalid', 'universe') +| fields `REPLACE('helloworld', 'world', 'universe')`, `REPLACE('helloworld', 'invalid', 'universe')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------------------------------------+----------------------------------------------+ +| REPLACE('helloworld', 'world', 'universe') | REPLACE('helloworld', 'invalid', 'universe') | +|--------------------------------------------+----------------------------------------------| +| hellouniverse | helloworld | ++--------------------------------------------+----------------------------------------------+ +``` + +Escaping Special Characters Examples + +```ppl +source=people +| eval `Replace domain` = REPLACE('api.example.com', 'example\\.com', 'newsite.org'), `Replace with quote` = REPLACE('https://api.example.com/v1', '\\Qhttps://api.example.com\\E', 'http://localhost:8080') +| fields `Replace domain`, `Replace with quote` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------+--------------------------+ +| Replace domain | Replace with quote | +|-----------------+--------------------------| +| api.newsite.org | http://localhost:8080/v1 | ++-----------------+--------------------------+ +``` + +Regex Pattern Examples + +```ppl +source=people +| eval `Remove digits` = REPLACE('test123', '\\d+', ''), `Collapse spaces` = REPLACE('hello world', ' +', ' '), `Remove special` = REPLACE('hello@world!', '[^a-zA-Z]', '') +| fields `Remove digits`, `Collapse spaces`, `Remove special` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------+-----------------+----------------+ +| Remove digits | Collapse spaces | Remove special | +|---------------+-----------------+----------------| +| test | hello world | helloworld | ++---------------+-----------------+----------------+ +``` + +Capture Group and Backreference Examples + +```ppl +source=people +| eval `Swap date` = REPLACE('1/14/2023', '^(\\d{1,2})/(\\d{1,2})/', '$2/$1/'), `Reverse words` = REPLACE('Hello World', '(\\w+) (\\w+)', '$2 $1'), `Extract domain` = REPLACE('user@example.com', '.*@(.+)', '$1') +| fields `Swap date`, `Reverse words`, `Extract domain` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------+---------------+----------------+ +| Swap date | Reverse words | Extract domain | +|-----------+---------------+----------------| +| 14/1/2023 | World Hello | example.com | ++-----------+---------------+----------------+ +``` + +Advanced
Regex Examples + +```ppl +source=people +| eval `Clean phone` = REPLACE('(555) 123-4567', '[^0-9]', ''), `Remove vowels` = REPLACE('hello world', '[aeiou]', ''), `Add prefix` = REPLACE('test', '^', 'pre_') +| fields `Clean phone`, `Remove vowels`, `Add prefix` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------+---------------+------------+ +| Clean phone | Remove vowels | Add prefix | +|-------------+---------------+------------| +| 5551234567 | hll wrld | pre_test | ++-------------+---------------+------------+ +``` + +**Note**: When using regex patterns in PPL queries: +* Backslashes must be escaped (use `\\` instead of `\`) - e.g., `\\d` for digit pattern, `\\w+` for word characters +* Backreferences support both PCRE-style (`\1`, `\2`, etc.) and Java-style (`$1`, `$2`, etc.) syntax. PCRE-style backreferences are automatically converted to Java-style internally. + +## REVERSE + +### Description + +Usage: REVERSE(str) returns the reversed string of the string supplied as an argument. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `REVERSE('abcde')` = REVERSE('abcde') +| fields `REVERSE('abcde')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------+ +| REVERSE('abcde') | +|------------------| +| edcba | ++------------------+ +``` + +## RIGHT + +### Description + +Usage: right(str, len) returns the rightmost len characters from the string str, or NULL if any argument is NULL. +Argument type: STRING, INTEGER +Return type: STRING +Example + +```ppl +source=people +| eval `RIGHT('helloworld', 5)` = RIGHT('helloworld', 5), `RIGHT('HELLOWORLD', 0)` = RIGHT('HELLOWORLD', 0) +| fields `RIGHT('helloworld', 5)`, `RIGHT('HELLOWORLD', 0)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------------+------------------------+ +| RIGHT('helloworld', 5) | RIGHT('HELLOWORLD', 0) | +|------------------------+------------------------| +| world | | ++------------------------+------------------------+ +``` + +## RTRIM + +### Description + +Usage: rtrim(str) trims trailing space characters from the string. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `RTRIM(' hello')` = RTRIM(' hello'), `RTRIM('hello ')` = RTRIM('hello ') +| fields `RTRIM(' hello')`, `RTRIM('hello ')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-------------------+-------------------+ +| RTRIM(' hello') | RTRIM('hello ') | +|-------------------+-------------------| +| hello | hello | ++-------------------+-------------------+ +``` + +## SUBSTRING + +### Description + +Usage: substring(str, start) or substring(str, start, length) returns the substring of str specified by start and length. With no length, the entire string from start is returned.
+Argument type: STRING, INTEGER, INTEGER +Return type: STRING +Synonyms: SUBSTR +Example + +```ppl +source=people +| eval `SUBSTRING('helloworld', 5)` = SUBSTRING('helloworld', 5), `SUBSTRING('helloworld', 5, 3)` = SUBSTRING('helloworld', 5, 3) +| fields `SUBSTRING('helloworld', 5)`, `SUBSTRING('helloworld', 5, 3)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------------------+-------------------------------+ +| SUBSTRING('helloworld', 5) | SUBSTRING('helloworld', 5, 3) | +|----------------------------+-------------------------------| +| oworld | owo | ++----------------------------+-------------------------------+ +``` + +## TRIM + +### Description + +Usage: trim(str) trims leading and trailing space characters from the string. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `TRIM(' hello')` = TRIM(' hello'), `TRIM('hello ')` = TRIM('hello ') +| fields `TRIM(' hello')`, `TRIM('hello ')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++------------------+------------------+ +| TRIM(' hello') | TRIM('hello ') | +|------------------+------------------| +| hello | hello | ++------------------+------------------+ +``` + +## UPPER + +### Description + +Usage: upper(string) converts the string to uppercase. +Argument type: STRING +Return type: STRING +Example + +```ppl +source=people +| eval `UPPER('helloworld')` = UPPER('helloworld'), `UPPER('HELLOWORLD')` = UPPER('HELLOWORLD') +| fields `UPPER('helloworld')`, `UPPER('HELLOWORLD')` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------------------+---------------------+ +| UPPER('helloworld') | UPPER('HELLOWORLD') | +|---------------------+---------------------| +| HELLOWORLD | HELLOWORLD | ++---------------------+---------------------+ +``` + +## REGEXP_REPLACE + +### Description + +Usage: regexp_replace(str, pattern, replacement) replaces all substrings of the string value that match pattern with replacement and returns the modified string value. +Argument type: STRING, STRING, STRING +Return type: STRING +Synonyms: [REPLACE](#replace) +Example + +```ppl +source=people +| eval `DOMAIN` = REGEXP_REPLACE('https://opensearch.org/downloads/', '^https?://(?:www\.)?([^/]+)/.*$', '\1') +| fields `DOMAIN` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++----------------+ +| DOMAIN | +|----------------| +| opensearch.org | ++----------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/string.rst b/docs/user/ppl/functions/string.rst deleted file mode 100644 index 3e94d220094..00000000000 --- a/docs/user/ppl/functions/string.rst +++ /dev/null @@ -1,479 +0,0 @@ -================ -String Functions -================ - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -CONCAT ------- - -Description ->>>>>>>>>>> - -Usage: CONCAT(str1, str2, ...., str_9) adds up to 9 strings together. - -Argument type: STRING, STRING, ...., STRING - -Return type: STRING - -Example:: - - os> source=people | eval `CONCAT('hello', 'world')` = CONCAT('hello', 'world'), `CONCAT('hello ', 'whole ', 'world', '!')` = CONCAT('hello ', 'whole ', 'world', '!') | fields `CONCAT('hello', 'world')`, `CONCAT('hello ', 'whole ', 'world', '!')` - fetched rows / total rows = 1/1 - +--------------------------+------------------------------------------+ - | CONCAT('hello', 'world') | CONCAT('hello ', 'whole ', 'world', '!') | - |--------------------------+------------------------------------------| - | helloworld | hello whole world!
| - +--------------------------+------------------------------------------+ - - -CONCAT_WS ---------- - -Description ->>>>>>>>>>> - -Usage: CONCAT_WS(sep, str1, str2) returns str1 concatenated with str2 using sep as a separator between them. - -Argument type: STRING, STRING, STRING - -Return type: STRING - -Example:: - - os> source=people | eval `CONCAT_WS(',', 'hello', 'world')` = CONCAT_WS(',', 'hello', 'world') | fields `CONCAT_WS(',', 'hello', 'world')` - fetched rows / total rows = 1/1 - +----------------------------------+ - | CONCAT_WS(',', 'hello', 'world') | - |----------------------------------| - | hello,world | - +----------------------------------+ - - -LENGTH ------- - -Description ->>>>>>>>>>> - -Specifications: - -1. LENGTH(STRING) -> INTEGER - -Usage: length(str) returns length of string measured in bytes. - -Argument type: STRING - -Return type: INTEGER - -Example:: - - os> source=people | eval `LENGTH('helloworld')` = LENGTH('helloworld') | fields `LENGTH('helloworld')` - fetched rows / total rows = 1/1 - +----------------------+ - | LENGTH('helloworld') | - |----------------------| - | 10 | - +----------------------+ - -LIKE ----- - -Description ->>>>>>>>>>> - -Usage: like(string, PATTERN[, case_sensitive]) return true if the string match the PATTERN. ``case_sensitive`` is optional. When set to ``true``, PATTERN is **case-sensitive**. **Default:** Determined by ``plugins.ppl.syntax.legacy.preferred``. - - * When ``plugins.ppl.syntax.legacy.preferred=true``, ``case_sensitive`` defaults to ``false`` - * When ``plugins.ppl.syntax.legacy.preferred=false``, ``case_sensitive`` defaults to ``true`` - -There are two wildcards often used in conjunction with the LIKE operator: - -* ``%`` - The percent sign represents zero, one, or multiple characters -* ``_`` - The underscore represents a single character - -Argument type: STRING, STRING [, BOOLEAN] - -Return type: INTEGER - -Example:: - - os> source=people | eval `LIKE('hello world', '_ello%')` = LIKE('hello world', '_ello%'), `LIKE('hello world', '_ELLo%', true)` = LIKE('hello world', '_ELLo%', true), `LIKE('hello world', '_ELLo%', false)` = LIKE('hello world', '_ELLo%', false) | fields `LIKE('hello world', '_ello%')`, `LIKE('hello world', '_ELLo%', true)`, `LIKE('hello world', '_ELLo%', false)` - fetched rows / total rows = 1/1 - +-------------------------------+-------------------------------------+--------------------------------------+ - | LIKE('hello world', '_ello%') | LIKE('hello world', '_ELLo%', true) | LIKE('hello world', '_ELLo%', false) | - |-------------------------------+-------------------------------------+--------------------------------------| - | True | False | True | - +-------------------------------+-------------------------------------+--------------------------------------+ - - -Limitation: The pushdown of the LIKE function to a DSL wildcard query is supported only for keyword fields. - -ILIKE ----- - -Description ->>>>>>>>>>> - -Usage: ilike(string, PATTERN) return true if the string match the PATTERN, PATTERN is **case-insensitive**. 
- -There are two wildcards often used in conjunction with the ILIKE operator: - -* ``%`` - The percent sign represents zero, one, or multiple characters -* ``_`` - The underscore represents a single character - -Argument type: STRING, STRING - -Return type: INTEGER - -Example:: - - os> source=people | eval `ILIKE('hello world', '_ELLo%')` = ILIKE('hello world', '_ELLo%') | fields `ILIKE('hello world', '_ELLo%')` - fetched rows / total rows = 1/1 - +--------------------------------+ - | ILIKE('hello world', '_ELLo%') | - |--------------------------------| - | True | - +--------------------------------+ - - -Limitation: The pushdown of the ILIKE function to a DSL wildcard query is supported only for keyword fields. - -LOCATE -------- - -Description ->>>>>>>>>>> - -Usage: locate(substr, str[, start]) returns the position of the first occurrence of substring substr in string str, starting searching from position start. If start is not specified, it defaults to 1 (the beginning of the string). Returns 0 if substr is not found. If any argument is NULL, the function returns NULL. - -Argument type: STRING, STRING[, INTEGER] - -Return type: INTEGER - -Example:: - - os> source=people | eval `LOCATE('world', 'helloworld')` = LOCATE('world', 'helloworld'), `LOCATE('invalid', 'helloworld')` = LOCATE('invalid', 'helloworld'), `LOCATE('world', 'helloworld', 6)` = LOCATE('world', 'helloworld', 6) | fields `LOCATE('world', 'helloworld')`, `LOCATE('invalid', 'helloworld')`, `LOCATE('world', 'helloworld', 6)` - fetched rows / total rows = 1/1 - +-------------------------------+---------------------------------+----------------------------------+ - | LOCATE('world', 'helloworld') | LOCATE('invalid', 'helloworld') | LOCATE('world', 'helloworld', 6) | - |-------------------------------+---------------------------------+----------------------------------| - | 6 | 0 | 6 | - +-------------------------------+---------------------------------+----------------------------------+ - - -LOWER ------ - -Description ->>>>>>>>>>> - -Usage: lower(string) converts the string to lowercase. - -Argument type: STRING - -Return type: STRING - -Example:: - - os> source=people | eval `LOWER('helloworld')` = LOWER('helloworld'), `LOWER('HELLOWORLD')` = LOWER('HELLOWORLD') | fields `LOWER('helloworld')`, `LOWER('HELLOWORLD')` - fetched rows / total rows = 1/1 - +---------------------+---------------------+ - | LOWER('helloworld') | LOWER('HELLOWORLD') | - |---------------------+---------------------| - | helloworld | helloworld | - +---------------------+---------------------+ - - -LTRIM ------ - -Description ->>>>>>>>>>> - -Usage: ltrim(str) trims leading space characters from the string. - -Argument type: STRING - -Return type: STRING - -Example:: - - os> source=people | eval `LTRIM(' hello')` = LTRIM(' hello'), `LTRIM('hello ')` = LTRIM('hello ') | fields `LTRIM(' hello')`, `LTRIM('hello ')` - fetched rows / total rows = 1/1 - +-------------------+-------------------+ - | LTRIM(' hello') | LTRIM('hello ') | - |-------------------+-------------------| - | hello | hello | - +-------------------+-------------------+ - - -POSITION --------- - -Description ->>>>>>>>>>> - -Usage: The syntax POSITION(substr IN str) returns the position of the first occurrence of substring substr in string str. Returns 0 if substr is not in str. Returns NULL if any argument is NULL. 
- -Argument type: STRING, STRING - -Return type INTEGER - -(STRING IN STRING) -> INTEGER - -Example:: - - os> source=people | eval `POSITION('world' IN 'helloworld')` = POSITION('world' IN 'helloworld'), `POSITION('invalid' IN 'helloworld')`= POSITION('invalid' IN 'helloworld') | fields `POSITION('world' IN 'helloworld')`, `POSITION('invalid' IN 'helloworld')` - fetched rows / total rows = 1/1 - +-----------------------------------+-------------------------------------+ - | POSITION('world' IN 'helloworld') | POSITION('invalid' IN 'helloworld') | - |-----------------------------------+-------------------------------------| - | 6 | 0 | - +-----------------------------------+-------------------------------------+ - - -REPLACE --------- - -Description ->>>>>>>>>>> - -Usage: replace(str, pattern, replacement) returns a string with all occurrences of the pattern replaced by the replacement string in str. If any argument is NULL, the function returns NULL. - -**Regular Expression Support**: The pattern argument supports Java regex syntax, including: - -Argument type: STRING, STRING (regex pattern), STRING (replacement) - -Return type: STRING - -**Important - Regex Special Characters**: The pattern is interpreted as a regular expression. Characters like ``.``, ``*``, ``+``, ``[``, ``]``, ``(``, ``)``, ``{``, ``}``, ``^``, ``$``, ``|``, ``?``, and ``\`` have special meaning in regex. To match them literally, escape with backslashes: - -* To match ``example.com``: use ``'example\\.com'`` (escape the dots) -* To match ``value*``: use ``'value\\*'`` (escape the asterisk) -* To match ``price+tax``: use ``'price\\+tax'`` (escape the plus) - -For strings with many special characters, use ``\\Q...\\E`` to quote the entire literal string (e.g., ``'\\Qhttps://example.com/path?id=123\\E'`` matches that exact URL). 
- -Literal String Replacement Examples:: - - os> source=people | eval `REPLACE('helloworld', 'world', 'universe')` = REPLACE('helloworld', 'world', 'universe'), `REPLACE('helloworld', 'invalid', 'universe')` = REPLACE('helloworld', 'invalid', 'universe') | fields `REPLACE('helloworld', 'world', 'universe')`, `REPLACE('helloworld', 'invalid', 'universe')` - fetched rows / total rows = 1/1 - +--------------------------------------------+----------------------------------------------+ - | REPLACE('helloworld', 'world', 'universe') | REPLACE('helloworld', 'invalid', 'universe') | - |--------------------------------------------+----------------------------------------------| - | hellouniverse | helloworld | - +--------------------------------------------+----------------------------------------------+ - -Escaping Special Characters Examples:: - - os> source=people | eval `Replace domain` = REPLACE('api.example.com', 'example\\.com', 'newsite.org'), `Replace with quote` = REPLACE('https://api.example.com/v1', '\\Qhttps://api.example.com\\E', 'http://localhost:8080') | fields `Replace domain`, `Replace with quote` - fetched rows / total rows = 1/1 - +-----------------+--------------------------+ - | Replace domain | Replace with quote | - |-----------------+--------------------------| - | api.newsite.org | http://localhost:8080/v1 | - +-----------------+--------------------------+ - -Regex Pattern Examples:: - - os> source=people | eval `Remove digits` = REPLACE('test123', '\\d+', ''), `Collapse spaces` = REPLACE('hello world', ' +', ' '), `Remove special` = REPLACE('hello@world!', '[^a-zA-Z]', '') | fields `Remove digits`, `Collapse spaces`, `Remove special` - fetched rows / total rows = 1/1 - +---------------+-----------------+----------------+ - | Remove digits | Collapse spaces | Remove special | - |---------------+-----------------+----------------| - | test | hello world | helloworld | - +---------------+-----------------+----------------+ - -Capture Group and Backreference Examples:: - - os> source=people | eval `Swap date` = REPLACE('1/14/2023', '^(\\d{1,2})/(\\d{1,2})/', '$2/$1/'), `Reverse words` = REPLACE('Hello World', '(\\w+) (\\w+)', '$2 $1'), `Extract domain` = REPLACE('user@example.com', '.*@(.+)', '$1') | fields `Swap date`, `Reverse words`, `Extract domain` - fetched rows / total rows = 1/1 - +-----------+---------------+----------------+ - | Swap date | Reverse words | Extract domain | - |-----------+---------------+----------------| - | 14/1/2023 | World Hello | example.com | - +-----------+---------------+----------------+ - -Advanced Regex Examples:: - - os> source=people | eval `Clean phone` = REPLACE('(555) 123-4567', '[^0-9]', ''), `Remove vowels` = REPLACE('hello world', '[aeiou]', ''), `Add prefix` = REPLACE('test', '^', 'pre_') | fields `Clean phone`, `Remove vowels`, `Add prefix` - fetched rows / total rows = 1/1 - +-------------+---------------+------------+ - | Clean phone | Remove vowels | Add prefix | - |-------------+---------------+------------| - | 5551234567 | hll wrld | pre_test | - +-------------+---------------+------------+ - -**Note**: When using regex patterns in PPL queries: - -* Backslashes must be escaped (use ``\\`` instead of ``\``) - e.g., ``\\d`` for digit pattern, ``\\w+`` for word characters -* Backreferences support both PCRE-style (``\1``, ``\2``, etc.) and Java-style (``$1``, ``$2``, etc.) syntax. PCRE-style backreferences are automatically converted to Java-style internally. 
- - -REVERSE -------- - -Description ->>>>>>>>>>> - -Usage: REVERSE(str) returns reversed string of the string supplied as an argument. - -Argument type: STRING - -Return type: STRING - -Example:: - - os> source=people | eval `REVERSE('abcde')` = REVERSE('abcde') | fields `REVERSE('abcde')` - fetched rows / total rows = 1/1 - +------------------+ - | REVERSE('abcde') | - |------------------| - | edcba | - +------------------+ - - -RIGHT ------ - -Description ->>>>>>>>>>> - -Usage: right(str, len) returns the rightmost len characters from the string str, or NULL if any argument is NULL. - -Argument type: STRING, INTEGER - -Return type: STRING - -Example:: - - os> source=people | eval `RIGHT('helloworld', 5)` = RIGHT('helloworld', 5), `RIGHT('HELLOWORLD', 0)` = RIGHT('HELLOWORLD', 0) | fields `RIGHT('helloworld', 5)`, `RIGHT('HELLOWORLD', 0)` - fetched rows / total rows = 1/1 - +------------------------+------------------------+ - | RIGHT('helloworld', 5) | RIGHT('HELLOWORLD', 0) | - |------------------------+------------------------| - | world | | - +------------------------+------------------------+ - - -RTRIM ------ - -Description ->>>>>>>>>>> - -Usage: rtrim(str) trims trailing space characters from the string. - -Argument type: STRING - -Return type: STRING - -Example:: - - os> source=people | eval `RTRIM(' hello')` = RTRIM(' hello'), `RTRIM('hello ')` = RTRIM('hello ') | fields `RTRIM(' hello')`, `RTRIM('hello ')` - fetched rows / total rows = 1/1 - +-------------------+-------------------+ - | RTRIM(' hello') | RTRIM('hello ') | - |-------------------+-------------------| - | hello | hello | - +-------------------+-------------------+ - - -SUBSTRING ---------- - -Description ->>>>>>>>>>> - -Usage: substring(str, start) or substring(str, start, length) returns substring using start and length. With no length, entire string from start is returned. - -Argument type: STRING, INTEGER, INTEGER - -Return type: STRING - -Synonyms: SUBSTR - -Example:: - - os> source=people | eval `SUBSTRING('helloworld', 5)` = SUBSTRING('helloworld', 5), `SUBSTRING('helloworld', 5, 3)` = SUBSTRING('helloworld', 5, 3) | fields `SUBSTRING('helloworld', 5)`, `SUBSTRING('helloworld', 5, 3)` - fetched rows / total rows = 1/1 - +----------------------------+-------------------------------+ - | SUBSTRING('helloworld', 5) | SUBSTRING('helloworld', 5, 3) | - |----------------------------+-------------------------------| - | oworld | owo | - +----------------------------+-------------------------------+ - - -TRIM ----- - -Description ->>>>>>>>>>> - -Argument Type: STRING - -Return type: STRING - -Example:: - - os> source=people | eval `TRIM(' hello')` = TRIM(' hello'), `TRIM('hello ')` = TRIM('hello ') | fields `TRIM(' hello')`, `TRIM('hello ')` - fetched rows / total rows = 1/1 - +------------------+------------------+ - | TRIM(' hello') | TRIM('hello ') | - |------------------+------------------| - | hello | hello | - +------------------+------------------+ - - -UPPER ------ - -Description ->>>>>>>>>>> - -Usage: upper(string) converts the string to uppercase. 
- -Argument type: STRING - -Return type: STRING - -Example:: - - os> source=people | eval `UPPER('helloworld')` = UPPER('helloworld'), `UPPER('HELLOWORLD')` = UPPER('HELLOWORLD') | fields `UPPER('helloworld')`, `UPPER('HELLOWORLD')` - fetched rows / total rows = 1/1 - +---------------------+---------------------+ - | UPPER('helloworld') | UPPER('HELLOWORLD') | - |---------------------+---------------------| - | HELLOWORLD | HELLOWORLD | - +---------------------+---------------------+ - - -REGEXP_REPLACE -------------- - -Description ->>>>>>>>>>> - -Usage: regexp_replace(str, pattern, replacement) replace all substrings of the string value that match pattern with replacement and returns modified string value. - -Argument type: STRING, STRING, STRING - -Return type: STRING - -Synonyms: `REPLACE`_ - -Example:: - - os> source=people | eval `DOMAIN` = REGEXP_REPLACE('https://opensearch.org/downloads/', '^https?://(?:www\.)?([^/]+)/.*$', '\1') | fields `DOMAIN` - fetched rows / total rows = 1/1 - +----------------+ - | DOMAIN | - |----------------| - | opensearch.org | - +----------------+ - diff --git a/docs/user/ppl/functions/system.md b/docs/user/ppl/functions/system.md new file mode 100644 index 00000000000..4eb2aeb8114 --- /dev/null +++ b/docs/user/ppl/functions/system.md @@ -0,0 +1,29 @@ +# System Functions + +## TYPEOF + +### Description + +Usage: the typeof(expr) function returns the name of the data type of the value that is passed to it. This can be helpful for troubleshooting or dynamically constructing SQL queries. +Argument type: ANY +Return type: STRING + +Example + +```ppl +source=people +| eval `typeof(date)` = typeof(DATE('2008-04-14')), `typeof(int)` = typeof(1), `typeof(now())` = typeof(now()), `typeof(column)` = typeof(accounts) +| fields `typeof(date)`, `typeof(int)`, `typeof(now())`, `typeof(column)` +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++--------------+-------------+---------------+----------------+ +| typeof(date) | typeof(int) | typeof(now()) | typeof(column) | +|--------------+-------------+---------------+----------------| +| DATE | INT | TIMESTAMP | STRUCT | ++--------------+-------------+---------------+----------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/functions/system.rst b/docs/user/ppl/functions/system.rst deleted file mode 100644 index 698933a3c47..00000000000 --- a/docs/user/ppl/functions/system.rst +++ /dev/null @@ -1,31 +0,0 @@ -================ -System Functions -================ - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 1 - -TYPEOF ------- - -Description ->>>>>>>>>>> - -Usage: typeof(expr) function returns name of the data type of the value that is passed to it. This can be helpful for troubleshooting or dynamically constructing SQL queries.
-
-Argument type: ANY
-
-Return type: STRING
-
-Example::
-
-    os> source=people | eval `typeof(date)` = typeof(DATE('2008-04-14')), `typeof(int)` = typeof(1), `typeof(now())` = typeof(now()), `typeof(column)` = typeof(accounts) | fields `typeof(date)`, `typeof(int)`, `typeof(now())`, `typeof(column)`
-    fetched rows / total rows = 1/1
-    +--------------+-------------+---------------+----------------+
-    | typeof(date) | typeof(int) | typeof(now()) | typeof(column) |
-    |--------------+-------------+---------------+----------------|
-    | DATE         | INTEGER     | TIMESTAMP     | OBJECT         |
-    +--------------+-------------+---------------+----------------+
diff --git a/docs/user/ppl/general/comments.md b/docs/user/ppl/general/comments.md
new file mode 100644
index 00000000000..224682ee0b4
--- /dev/null
+++ b/docs/user/ppl/general/comments.md
@@ -0,0 +1,49 @@
+# Comments
+
+Comments are text that is not evaluated. PPL supports both line comments and block comments.
+
+## Line Comments
+
+Line comments begin with two slashes ( // ) and end with a new line.
+Example
+
+```ppl
+source=accounts
+| top gender // finds most common gender of all the accounts
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 2/2
++--------+-------+
+| gender | count |
+|--------+-------|
+| M      | 3     |
+| F      | 1     |
++--------+-------+
+```
+
+## Block Comments
+
+Block comments begin with a slash followed by an asterisk ( /\* ) and end with an asterisk followed by a slash ( \*/ ).
+Example
+
+```ppl
+source=accounts
+| dedup 2 gender /* dedup the document with gender field keep 2 duplication */
+| fields account_number, gender
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 3/3
++----------------+--------+
+| account_number | gender |
+|----------------+--------|
+| 13             | F      |
+| 1              | M      |
+| 6              | M      |
++----------------+--------+
+```
+ 
\ No newline at end of file
diff --git a/docs/user/ppl/general/comments.rst b/docs/user/ppl/general/comments.rst
deleted file mode 100644
index a0994e970c0..00000000000
--- a/docs/user/ppl/general/comments.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-========
-Comments
-========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Comments are not evaluated texts. PPL supports both line comments and block comments.
-
-Line Comments
--------------
-Line comments begin with two slashes ( // ) and end with a new line.
-
-Example::
-
-    os> source=accounts | top gender // finds most common gender of all the accounts
-    fetched rows / total rows = 2/2
-    +----------+
-    | gender   |
-    |----------|
-    | M        |
-    | F        |
-    +----------+
-
-Block Comments
---------------
-Block comments begin with a slash followed by an asterisk ( /\* ) and end with an asterisk followed by a slash ( \*/ ).
-
-Example::
-
-    os> source=accounts | dedup 2 gender /* dedup the document with gender field keep 2 duplication */ | fields account_number, gender
-    fetched rows / total rows = 3/3
-    +------------------+----------+
-    | account_number   | gender   |
-    |------------------+----------|
-    | 1                | M        |
-    | 6                | M        |
-    | 13               | F        |
-    +------------------+----------+
-
diff --git a/docs/user/ppl/general/datatypes.md b/docs/user/ppl/general/datatypes.md
new file mode 100644
index 00000000000..27fc71155ee
--- /dev/null
+++ b/docs/user/ppl/general/datatypes.md
@@ -0,0 +1,327 @@
+# Data Types
+
+## Overview
+
+### PPL Data Types
+
+PPL supports the following data types.
+
+| PPL Data Type |
+| --- |
+| boolean |
+| tinyint |
+| smallint |
+| int |
+| bigint |
+| float |
+| double |
+| string |
+| timestamp |
+| date |
+| time |
+| interval |
+| ip |
+| geo_point |
+| binary |
+| struct |
+| array |
+
+### Data Types Mapping
+
+The table below lists the mapping between OpenSearch data types, PPL data types, and SQL types.
+
+| OpenSearch Type | PPL Type | SQL Type |
+| --- | --- | --- |
+| boolean | boolean | BOOLEAN |
+| byte | tinyint | TINYINT |
+| short | smallint | SMALLINT |
+| integer | int | INTEGER |
+| long | bigint | BIGINT |
+| float | float | REAL |
+| half_float | float | FLOAT |
+| scaled_float | float | DOUBLE |
+| double | double | DOUBLE |
+| keyword | string | VARCHAR |
+| text | string | VARCHAR |
+| match_only_text | string | VARCHAR |
+| date | timestamp | TIMESTAMP |
+| ip | ip | VARCHAR |
+| binary | binary | VARBINARY |
+| object | struct | STRUCT |
+| nested | array | STRUCT |
+
+Note: not every PPL type has a corresponding OpenSearch type, e.g. date and time. To use a function that requires such a data type, the value must be explicitly converted first.
+
+## Numeric Data Types
+
+Numeric values in the range -2147483648 to +2147483647 are recognized as integers with type name `int`. Values outside that range are parsed as `bigint`.
+
+## Date and Time Data Types
+
+The date and time data types represent temporal values, and the PPL plugin supports the types DATE, TIME, TIMESTAMP and INTERVAL. By default, the OpenSearch DSL uses the date type as its only date and time related type, which contains all information about an absolute time point. To integrate with the PPL language, each of the types other than timestamp holds part of the temporal or timezone information. This explicit distinction between the date and time types is reflected in the datetime functions (see [Functions](functions.md) for details), where some functions have restrictions on the input argument type.
+
+### Date
+
+Date represents the calendar date regardless of the time zone. A given date value represents a 24-hour period (a day), but this period varies across timezones and might have flexible hours during Daylight Saving Time. The date type does not contain time information. The supported range is '0001-01-01' to '9999-12-31'.
+
+| Type | Syntax | Range |
+| --- | --- | --- |
+| Date | 'yyyy-MM-dd' | '0001-01-01' to '9999-12-31' |
+
+### Time
+
+Time represents the time on the clock or watch with no regard for which timezone it might be related with. Time type data does not have date information.
+
+| Type | Syntax | Range |
+| --- | --- | --- |
+| Time | 'hh:mm:ss[.fraction]' | '00:00:00.000000' to '23:59:59.999999' |
+
+### Timestamp
+
+A timestamp instance is an absolute instant independent of timezone or convention. For example, expressing a given point in time in another timezone changes its displayed value accordingly. The storage of the timestamp type also differs from the other types: a timestamp is converted from the current timezone to UTC for storage, and converted back from UTC to the session timezone when retrieved.
+
+| Type | Syntax | Range |
+| --- | --- | --- |
+| Timestamp | 'yyyy-MM-dd hh:mm:ss[.fraction]' | '0001-01-01 00:00:01.000000' UTC to '9999-12-31 23:59:59.999999' |
+
+### Interval
+
+Interval data type represents a temporal duration or a period. The syntax is as follows:
+
+| Type | Syntax |
+| --- | --- |
+| Interval | INTERVAL expr unit |
+
+The expr is any expression that eventually evaluates to a quantity value; see [Expressions](expressions.md) for details. The unit is the unit for interpreting the quantity, one of MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER and YEAR. The INTERVAL keyword and the unit specifier are not case sensitive. Note that there are two classes of intervals: year-week intervals can store years, quarters, months and weeks, while day-time intervals can store days, hours, minutes, seconds and microseconds. Intervals are comparable only with intervals of the same class.
+
+### Conversion between date and time types
+
+The date and time types, except interval, can generally be converted to each other, though the conversion might alter the value or lose some information, for example when extracting the time value from a timestamp, or converting a date value to a timestamp. The following summarizes the conversion rules that the PPL plugin supports for each type (a short sketch of these rules follows the list):
+
+#### Conversion from DATE
+
+- Since the date value does not have any time information, conversion to the [Time](#time) type is not useful and will always return a zero time value '00:00:00'.
+- Conversion to timestamp attaches the zero time value '00:00:00' and the session timezone (UTC by default) to the date. For example, the result of converting date '2020-08-17' to timestamp type with session timezone UTC is timestamp '2020-08-17 00:00:00' UTC.
+
+#### Conversion from TIME
+
+- A time value cannot be converted to any other date and time type, since it does not contain any date information; it is not meaningful to construct a date/timestamp instance without date info.
+
+#### Conversion from TIMESTAMP
+
+- Conversion from timestamp is more straightforward: converting to date extracts the date value, and converting to time extracts the time value. For example, converting timestamp '2020-08-17 14:09:00' UTC to date gives date '2020-08-17', and to time gives '14:09:00'.
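+
+As a rough sketch of these conversion rules (this assumes the `people` test index used elsewhere in this manual, and the `CAST` function described under Type Conversion Functions), a timestamp can be built from a string and then narrowed to a date and a time:
+
+```
+source=people
+| eval ts = CAST('2020-08-17 14:09:00' AS TIMESTAMP)
+| eval d = CAST(ts AS DATE), t = CAST(ts AS TIME)
+| fields d, t, ts
+```
+
+Following the rules above, `d` would be '2020-08-17' and `t` would be '14:09:00'.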
+## String Data Types
+
+A string is a sequence of characters enclosed in either single or double quotes. For example, both 'text' and "text" are treated as string literals.
+
+## Query Struct Data Types
+
+In PPL, the struct data type corresponds to the [Object field type in OpenSearch](https://opensearch.org/docs/latest/field-types/supported-field-types/object/). The "." is used as the path selector when accessing an inner attribute of struct data.
+
+### Example: People
+
+There are three fields in test index `people`: 1) deep nested object field `city`; 2) object field of array value `account`; 3) nested field `projects`
+
+```bash
+{
+  "mappings": {
+    "properties": {
+      "city": {
+        "properties": {
+          "name": {
+            "type": "keyword"
+          },
+          "location": {
+            "properties": {
+              "latitude": {
+                "type": "double"
+              }
+            }
+          }
+        }
+      },
+      "account": {
+        "properties": {
+          "id": {
+            "type": "keyword"
+          }
+        }
+      },
+      "projects": {
+        "type": "nested",
+        "properties": {
+          "name": {
+            "type": "keyword"
+          }
+        }
+      }
+    }
+  }
+}
+
+```
+
+### Example: Employees
+
+Here is the mapping for test index `employees_nested`.
Note that field `projects` is a nested field + +```bash +{ + "mappings": { + "properties": { + "id": { + "type": "long" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "projects": { + "type": "nested", + "properties": { + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "fielddata": true + }, + "started_year": { + "type": "long" + } + } + }, + "title": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + } + } + } +} + + +``` + +```bash +{ + "employees_nested" : [ + { + "id" : 3, + "name" : "Bob Smith", + "title" : null, + "projects" : [ + { + "name" : "AWS Redshift Spectrum querying", + "started_year" : 1990 + }, + { + "name" : "AWS Redshift security", + "started_year" : 1999 + }, + { + "name" : "AWS Aurora security", + "started_year" : 2015 + } + ] + }, + { + "id" : 4, + "name" : "Susan Smith", + "title" : "Dev Mgr", + "projects" : [ ] + }, + { + "id" : 6, + "name" : "Jane Smith", + "title" : "Software Eng 2", + "projects" : [ + { + "name" : "AWS Redshift security", + "started_year" : 1998 + }, + { + "name" : "AWS Hello security", + "started_year" : 2015, + "address" : [ + { + "city" : "Dallas", + "state" : "TX" + } + ] + } + ] + } + ] +} + + +``` + +### Example 1: Select struct inner attribute + +The example show fetch city (top level), city.name (second level), city.location.latitude (deeper level) struct type data from people results. + +```ppl +source=people +| fields city, city.name, city.location.latitude +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------------------------------------+-----------+------------------------+ +| city | city.name | city.location.latitude | +|-----------------------------------------------------+-----------+------------------------| +| {'name': 'Seattle', 'location': {'latitude': 10.5}} | Seattle | 10.5 | ++-----------------------------------------------------+-----------+------------------------+ +``` + +### Example 2: Group by struct inner attribute + +The example show group by object field inner attribute. + +```ppl +source=people +| stats count() by city.name +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++---------+-----------+ +| count() | city.name | +|---------+-----------| +| 1 | Seattle | ++---------+-----------+ +``` + +### Example 3: Selecting Field of Array Value + +Select deeper level for object fields of array value which returns the first element in the array. For example, because inner field `accounts.id` has three values instead of a tuple in this document, the first entry is returned. + +```ppl +source = people +| fields accounts, accounts.id +``` + +Expected output: + +```text +fetched rows / total rows = 1/1 ++-----------------------+-------------+ +| accounts | accounts.id | +|-----------------------+-------------| +| [{'id': 1},{'id': 2}] | 1 | ++-----------------------+-------------+ +``` + \ No newline at end of file diff --git a/docs/user/ppl/general/datatypes.rst b/docs/user/ppl/general/datatypes.rst deleted file mode 100644 index 1f73ca3531a..00000000000 --- a/docs/user/ppl/general/datatypes.rst +++ /dev/null @@ -1,392 +0,0 @@ - -========== -Data Types -========== - -.. rubric:: Table of contents - -.. contents:: - :local: - :depth: 2 - - -Overview -======== - -PPL Data Types -------------------- - -The PPL support the following data types. 
- -+---------------+ -| PPL Data Type | -+===============+ -| boolean | -+---------------+ -| tinyint | -+---------------+ -| smallint | -+---------------+ -| int | -+---------------+ -| bigint | -+---------------+ -| float | -+---------------+ -| double | -+---------------+ -| string | -+---------------+ -| timestamp | -+---------------+ -| date | -+---------------+ -| time | -+---------------+ -| interval | -+---------------+ -| ip | -+---------------+ -| geo_point | -+---------------+ -| binary | -+---------------+ -| struct | -+---------------+ -| array | -+---------------+ - -Data Types Mapping ------------------- - -The table below list the mapping between OpenSearch Data Type, PPL Data Type and SQL Type. - -+-----------------+---------------+-----------+ -| OpenSearch Type | PPL Type | SQL Type | -+=================+===============+===========+ -| boolean | boolean | BOOLEAN | -+-----------------+---------------+-----------+ -| byte | tinyint | TINYINT | -+-----------------+---------------+-----------+ -| short | smallint | SMALLINT | -+-----------------+---------------+-----------+ -| integer | int | INTEGER | -+-----------------+---------------+-----------+ -| long | bigint | BIGINT | -+-----------------+---------------+-----------+ -| float | float | REAL | -+-----------------+---------------+-----------+ -| half_float | float | FLOAT | -+-----------------+---------------+-----------+ -| scaled_float | float | DOUBLE | -+-----------------+---------------+-----------+ -| double | double | DOUBLE | -+-----------------+---------------+-----------+ -| keyword | string | VARCHAR | -+-----------------+---------------+-----------+ -| text | string | VARCHAR | -+-----------------+---------------+-----------+ -| match_only_text | string | VARCHAR | -+-----------------+---------------+-----------+ -| date | timestamp | TIMESTAMP | -+-----------------+---------------+-----------+ -| ip | ip | VARCHAR | -+-----------------+---------------+-----------+ -| binary | binary | VARBINARY | -+-----------------+---------------+-----------+ -| object | struct | STRUCT | -+-----------------+---------------+-----------+ -| nested | array | STRUCT | -+-----------------+---------------+-----------+ - -Notes: Not all the PPL Type has correspond OpenSearch Type. e.g. data and time. To use function which required such data type, user should explicit convert the data type. - - - -Numeric Data Types -================== - -Numeric values ranged from -2147483648 to +2147483647 are recognized as integer with type name ``int``. For others outside the range, ``bigint`` integer will be the data type after parsed. - - -Date and Time Data Types -======================== - -The date and time data types are the types that represent temporal values and PPL plugin supports types including DATE, TIME, TIMESTAMP and INTERVAL. By default, the OpenSearch DSL uses date type as the only date and time related type, which has contained all information about an absolute time point. To integrate with PPL language, each of the types other than timestamp is holding part of temporal or timezone information, and the usage to explicitly clarify the date and time types is reflected in the datetime functions (see `Functions `_ for details), where some functions might have restrictions in the input argument type. - - -Date ----- - -Date represents the calendar date regardless of the time zone. 
A given date value represents a 24-hour period, or say a day, but this period varies in different timezones and might have flexible hours during Daylight Savings Time programs. Besides, the date type does not contain time information as well. The supported range is '1000-01-01' to '9999-12-31'. - -+------+--------------+------------------------------+ -| Type | Syntax | Range | -+======+==============+==============================+ -| Date | 'yyyy-MM-dd' | '0001-01-01' to '9999-12-31' | -+------+--------------+------------------------------+ - - -Time ----- - -Time represents the time on the clock or watch with no regard for which timezone it might be related with. Time type data does not have date information. - -+------+-----------------------+----------------------------------------+ -| Type | Syntax | Range | -+======+=======================+========================================+ -| Time | 'hh:mm:ss[.fraction]' | '00:00:00.000000' to '23:59:59.999999' | -+------+-----------------------+----------------------------------------+ - - -Timestamp ---------- - -A timestamp instance is an absolute instant independent of timezone or convention. For example, for a given point of time, if we set the timestamp of this time point into another timezone, the value should also be different accordingly. Besides, the storage of timestamp type is also different from the other types. The timestamp is converted from the current timezone to UTC for storage, and is converted back to the set timezone from UTC when retrieving. - -+-----------+----------------------------------+------------------------------------------------------------------+ -| Type | Syntax | Range | -+===========+==================================+==================================================================+ -| Timestamp | 'yyyy-MM-dd hh:mm:ss[.fraction]' | '0001-01-01 00:00:01.000000' UTC to '9999-12-31 23:59:59.999999' | -+-----------+----------------------------------+------------------------------------------------------------------+ - - -Interval --------- - -Interval data type represents a temporal duration or a period. The syntax is as follows: - -+----------+--------------------+ -| Type | Syntax | -+==========+====================+ -| Interval | INTERVAL expr unit | -+----------+--------------------+ - -The expr is any expression that can be iterated to a quantity value eventually, see `Expressions `_ for details. The unit represents the unit for interpreting the quantity, including MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER and YEAR.The INTERVAL keyword and the unit specifier are not case sensitive. Note that there are two classes of intervals. Year-week intervals can store years, quarters, months and weeks. Day-time intervals can store days, hours, minutes, seconds and microseconds. Year-week intervals are comparable only with another year-week intervals. These two types of intervals can only comparable with the same type of themselves. - - -Conversion between date and time types --------------------------------------- - -Basically the date and time types except interval can be converted to each other, but might suffer some alteration of the value or some information loss, for example extracting the time value from a timestamp value, or convert a date value to a timestamp value and so forth. 
Here lists the summary of the conversion rules that PPL plugin supports for each of the types: - -Conversion from DATE ->>>>>>>>>>>>>>>>>>>> - -- Since the date value does not have any time information, conversion to `Time`_ type is not useful, and will always return a zero time value '00:00:00'. - -- Conversion to timestamp is to alternate both the time value and the timezone information, and it attaches the zero time value '00:00:00' and the session timezone (UTC by default) to the date. For example, the result to covert date '2020-08-17' to timestamp type with session timezone UTC is timestamp '2020-08-17 00:00:00' UTC. - - -Conversion from TIME ->>>>>>>>>>>>>>>>>>>> - -- Time value cannot be converted to any other date and time types since it does not contain any date information, so it is not meaningful to give no date info to a date/timestamp instance. - - -Conversion from TIMESTAMP ->>>>>>>>>>>>>>>>>>>>>>>>> - -- Conversion from timestamp is much more straightforward. To convert it to date is to extract the date value, and conversion to time is to extract the time value. For example, the result to convert timestamp '2020-08-17 14:09:00' UTC to date is date '2020-08-17', to time is '14:09:00'. - - -String Data Types -================= - -A string is a sequence of characters enclosed in either single or double quotes. For example, both 'text' and "text" will be treated as string literal. - - -Query Struct Data Types -======================= - -In PPL, the Struct Data Types corresponding to the `Object field type in OpenSearch `_. The "." is used as the path selector when access the inner attribute of the struct data. - -Example: People ---------------- - -There are three fields in test index ``people``: 1) deep nested object field ``city``; 2) object field of array value ``account``; 3) nested field ``projects``:: - - { - "mappings": { - "properties": { - "city": { - "properties": { - "name": { - "type": "keyword" - }, - "location": { - "properties": { - "latitude": { - "type": "double" - } - } - } - } - }, - "account": { - "properties": { - "id": { - "type": "keyword" - } - } - }, - "projects": { - "type": "nested", - "properties": { - "name": { - "type": "keyword" - } - } - } - } - } - } - -Example: Employees ------------------- - -Here is the mapping for test index ``employees_nested``. 
Note that field ``projects`` is a nested field:: - - { - "mappings": { - "properties": { - "id": { - "type": "long" - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "projects": { - "type": "nested", - "properties": { - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - }, - "fielddata": true - }, - "started_year": { - "type": "long" - } - } - }, - "title": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - } - } - } - } - - -Result set:: - - { - "employees_nested" : [ - { - "id" : 3, - "name" : "Bob Smith", - "title" : null, - "projects" : [ - { - "name" : "AWS Redshift Spectrum querying", - "started_year" : 1990 - }, - { - "name" : "AWS Redshift security", - "started_year" : 1999 - }, - { - "name" : "AWS Aurora security", - "started_year" : 2015 - } - ] - }, - { - "id" : 4, - "name" : "Susan Smith", - "title" : "Dev Mgr", - "projects" : [ ] - }, - { - "id" : 6, - "name" : "Jane Smith", - "title" : "Software Eng 2", - "projects" : [ - { - "name" : "AWS Redshift security", - "started_year" : 1998 - }, - { - "name" : "AWS Hello security", - "started_year" : 2015, - "address" : [ - { - "city" : "Dallas", - "state" : "TX" - } - ] - } - ] - } - ] - } - - -Example 1: Select struct inner attribute ----------------------------------------- - -The example show fetch city (top level), city.name (second level), city.location.latitude (deeper level) struct type data from people results. - -PPL query:: - - os> source=people | fields city, city.name, city.location.latitude; - fetched rows / total rows = 1/1 - +-----------------------------------------------------+-----------+------------------------+ - | city | city.name | city.location.latitude | - |-----------------------------------------------------+-----------+------------------------| - | {'name': 'Seattle', 'location': {'latitude': 10.5}} | Seattle | 10.5 | - +-----------------------------------------------------+-----------+------------------------+ - - -Example 2: Group by struct inner attribute ------------------------------------------- - -The example show group by object field inner attribute. - -PPL query:: - - os> source=people | stats count() by city.name; - fetched rows / total rows = 1/1 - +---------+-----------+ - | count() | city.name | - |---------+-----------| - | 1 | Seattle | - +---------+-----------+ - -Example 3: Selecting Field of Array Value ------------------------------------------ - -Select deeper level for object fields of array value which returns the first element in the array. For example, because inner field ``accounts.id`` has three values instead of a tuple in this document, the first entry is returned.:: - - os> source = people | fields accounts, accounts.id; - fetched rows / total rows = 1/1 - +-----------------------+-------------+ - | accounts | accounts.id | - |-----------------------+-------------| - | [{'id': 1},{'id': 2}] | 1 | - +-----------------------+-------------+ diff --git a/docs/user/ppl/general/identifiers.md b/docs/user/ppl/general/identifiers.md new file mode 100644 index 00000000000..c532e9929f3 --- /dev/null +++ b/docs/user/ppl/general/identifiers.md @@ -0,0 +1,188 @@ +# Identifiers + +## Introduction + +Identifiers are used for naming your database objects, such as index name, field name, alias etc. Basically there are two types of identifiers: regular identifiers and delimited identifiers. 
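+
+As a quick preview (a sketch using the `accounts` test index from the examples below), the same query can reference fields as regular identifiers or as backtick-delimited identifiers; both forms are covered in the sections that follow:
+
+```
+source=accounts
+| fields firstname, `lastname`
+```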
+## Regular Identifiers
+
+### Description
+
+A regular identifier is a string of characters that must start with an ASCII letter (lower or upper case). Each subsequent character can be a letter, digit, or underscore (`_`). It cannot be a reserved keyword, and whitespace and other special characters are not allowed.
+For OpenSearch, the following identifiers are additionally supported:
+1. Identifiers prefixed by dot `.`: this is called a hidden index in OpenSearch, for example `.opensearch_dashboards`.
+2. Identifiers prefixed by at sign `@`: this is common for meta fields generated in Logstash ingestion.
+3. Identifiers with `-` in the middle: this is mostly the case for index names with date information.
+4. Identifiers with star `*` present: this is mostly an index pattern for wildcard match.
+
+Index names with a date suffix separated by dashes or dots, such as `cwl-2020.01.11` or `logs-7.0-2020.01.11`, are common for indices created by Logstash or FileBeat ingestion. For user convenience, this kind of identifier can therefore be used as an index name without quoting. In this case, wildcards within the date pattern are also allowed, to search for data across indices of different date ranges. For example, you can use `logs-2020.1*` to search in indices for October, November and December 2020.
+
+### Examples
+
+Here are examples for using an index pattern directly without quotes
+
+```ppl
+source=accounts
+| fields account_number, firstname, lastname
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------+-----------+----------+
+| account_number | firstname | lastname |
+|----------------+-----------+----------|
+| 1              | Amber     | Duke     |
+| 6              | Hattie    | Bond     |
+| 13             | Nanette   | Bates    |
+| 18             | Dale      | Adams    |
++----------------+-----------+----------+
+```
+
+## Delimited Identifiers
+
+### Description
+
+A delimited identifier is an identifier enclosed in back ticks `` ` ``. In this case, the identifier enclosed is not necessarily a regular identifier. In other words, it can contain any special character not allowed in a regular identifier.
+
+### Use Cases
+
+Here are typical examples of the use of delimited identifiers:
+1. Identifiers of reserved keyword name
+2. Identifiers with dot `.` present: similar to `-` in index names with date information, quoting is required so the parser can differentiate it from an identifier with qualifiers.
+3. Identifiers with other special characters: OpenSearch has its own rules that allow more special characters; for example, Unicode characters are supported in index names.
+
+### Examples
+
+Here are examples for quoting an index name with back ticks
+
+```ppl
+source=`accounts`
+| fields `account_number`
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++----------------+
+| account_number |
+|----------------|
+| 1              |
+| 6              |
+| 13             |
+| 18             |
++----------------+
+```
+
+## Cross-Cluster Index Identifiers
+
+### Description
+
+A cross-cluster index identifier is an index identifier prefixed by a cluster identifier and `:`. The cluster identifier can contain star `*`, mostly as a cluster pattern for wildcard match.
+
+### Use Cases
+
+It is used to identify an index on a remote cluster for cross-cluster search.
+
+### Examples
+
+For example, if you set up a connection between the local cluster and a remote cluster `my_cluster`, then you can run `source=my_cluster:accounts` to query the `accounts` index at `my_cluster`.
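+
+A minimal sketch, assuming a remote cluster connection named `my_cluster` has already been configured for cross-cluster search:
+
+```
+source=my_cluster:accounts
+| fields account_number
+```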
+## Case Sensitivity
+
+### Description
+
+Identifiers are treated in a case-sensitive manner, so they must exactly match what is stored in OpenSearch.
+
+### Examples
+
+For example, if you run `source=Accounts`, it will end up with an index not found exception from the plugin, because the actual index name is lowercase.
+
+## Multiple Indices
+
+### Description
+
+To query multiple indices, you could
+1. Include `*` in the index name; this is an index pattern for wildcard match.
+2. Include multiple indices separated by `,`.
+3. Quote multiple indices separated by `,` as one delimited identifier. Note: no space is allowed between the indices.
+
+### Examples
+
+Query wildcard indices
+
+```ppl
+source=acc*
+| stats count()
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+
+| count() |
+|---------|
+| 5       |
++---------+
+```
+
+Query multiple indices separated by `,`
+
+```ppl
+source=accounts, account2
+| stats count()
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+
+| count() |
+|---------|
+| 5       |
++---------+
+```
+
+Query delimited multiple indices separated by `,`
+
+```ppl
+source=`accounts,account2`
+| stats count()
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 1/1
++---------+
+| count() |
+|---------|
+| 5       |
++---------+
+```
+
+## Metadata Identifiers
+
+### Description
+
+One can also provide meta-field name(s) to retrieve reserved fields (beginning with underscore) from OpenSearch documents. Meta-fields are not output
+in the default field list (`search source=`) and must be explicitly included to be returned.
+
+### Examples
+
+Query metadata fields:
+
+```ppl
+source=accounts
+| fields firstname, lastname, _index, _sort
+```
+
+Expected output:
+
+```text
+fetched rows / total rows = 4/4
++-----------+----------+----------+-------+
+| firstname | lastname | _index   | _sort |
+|-----------+----------+----------+-------|
+| Amber     | Duke     | accounts | -2    |
+| Hattie    | Bond     | accounts | -2    |
+| Nanette   | Bates    | accounts | -2    |
+| Dale      | Adams    | accounts | -2    |
++-----------+----------+----------+-------+
+```
+ 
\ No newline at end of file
diff --git a/docs/user/ppl/general/identifiers.rst b/docs/user/ppl/general/identifiers.rst
deleted file mode 100644
index af4e81514c8..00000000000
--- a/docs/user/ppl/general/identifiers.rst
+++ /dev/null
@@ -1,188 +0,0 @@
-===========
-Identifiers
-===========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-
-Introduction
-============
-
-Identifiers are used for naming your database objects, such as index name, field name, alias etc. Basically there are two types of identifiers: regular identifiers and delimited identifiers.
-
-
-Regular Identifiers
-===================
-
-Description
------------
-
-A regular identifier is a string of characters that must start with ASCII letter (lower or upper case). The subsequent character can be a combination of letter, digit, underscore (``_``). It cannot be a reversed key word. And whitespace and other special characters are not allowed.
-
-For OpenSearch, the following identifiers are supported extensionally:
-
-1. Identifiers prefixed by dot ``.``: this is called hidden index in OpenSearch, for example ``.opensearch_dashboards``.
-2. Identifiers prefixed by at sign ``@``: this is common for meta fields generated in Logstash ingestion.
-3. Identifiers with ``-`` in the middle: this is mostly the case for index name with date information.
-4. Identifiers with star ``*`` present: this is mostly an index pattern for wildcard match.
- -Index name with date suffix separated by dash or dots, such as ``cwl-2020.01.11`` or ``logs-7.0-2020.01.11``, is common for those created by Logstash or FileBeat ingestion. So, this kind of identifier used as index name is also supported without the need of being quoted for user convenience. In this case, wildcard within date pattern is also allowed to search for data across indices of different date range. For example, you can use ``logs-2020.1*`` to search in indices for October, November and December 2020. - -Examples --------- - -Here are examples for using index pattern directly without quotes:: - - os> source=accounts | fields account_number, firstname, lastname; - fetched rows / total rows = 4/4 - +----------------+-----------+----------+ - | account_number | firstname | lastname | - |----------------+-----------+----------| - | 1 | Amber | Duke | - | 6 | Hattie | Bond | - | 13 | Nanette | Bates | - | 18 | Dale | Adams | - +----------------+-----------+----------+ - - -Delimited Identifiers -===================== - -Description ------------ - -A delimited identifier is an identifier enclosed in back ticks `````. In this case, the identifier enclosed is not necessarily a regular identifier. In other words, it can contain any special character not allowed by regular identifier. - -Use Cases ---------- - -Here are typical examples of the use of delimited identifiers: - -1. Identifiers of reserved key word name -2. Identifiers with dot ``.`` present: similarly as ``-`` in index name to include date information, it is required to be quoted so parser can differentiate it from identifier with qualifiers. -3. Identifiers with other special character: OpenSearch has its own rule which allows more special character, for example Unicode character is supported in index name. - -Examples --------- - -Here are examples for quoting an index name by back ticks:: - - os> source=`accounts` | fields `account_number`; - fetched rows / total rows = 4/4 - +----------------+ - | account_number | - |----------------| - | 1 | - | 6 | - | 13 | - | 18 | - +----------------+ - - -Cross-Cluster Index Identifiers -=================== - -Description ------------ - -A cross-cluster index identifier is an index identifier with a prefix ``:``. The cluster identifier could contain star ``*``. This is mostly an cluster pattern for wildcard match. - -Use Cases ---------- - -It is used to identify an index on a remote cluster for cross-cluster search. - -Examples --------- - -For example, if you setup a connection between the local cluster and a remote cluster ``my_cluster``, then you can run ``source=my_cluster:accounts`` to query the ``accounts`` index at ``my_cluster``. - - -Case Sensitivity -================ - -Description ------------ - -Identifiers are treated in case sensitive manner. So it must be exactly same as what is stored in OpenSearch. - -Examples --------- - -For example, if you run ``source=Accounts``, it will end up with an index not found exception from our plugin because the actual index name is under lower case. - -Multiple Indices -================ - -Description ------------ - -To query multiple indices, you could - -1. Include ``*`` in index name, this is an index pattern for wildcard match. -2. Include multiple indices and seperated them by ``,``. -3. Delimited multiple indices and seperated them by ``,``. Note: no space allowed between each index. 
-
-
-Examples
----------
-
-Query wildcard indices::
-
-    os> source=acc* | stats count();
-    fetched rows / total rows = 1/1
-    +---------+
-    | count() |
-    |---------|
-    | 5       |
-    +---------+
-
-Query multiple indices seperated by ``,``::
-
-    os> source=accounts, account2 | stats count();
-    fetched rows / total rows = 1/1
-    +---------+
-    | count() |
-    |---------|
-    | 5       |
-    +---------+
-
-Query delimited multiple indices seperated by ``,``::
-
-    os> source=`accounts,account2` | stats count();
-    fetched rows / total rows = 1/1
-    +---------+
-    | count() |
-    |---------|
-    | 5       |
-    +---------+
-
-Metadata Identifiers
-====================
-
-Description
------------
-
-One can also provide meta-field name(s) to retrieve reserved-fields (beginning with underscore) from OpenSearch documents. Meta-fields are not output
-as default field list (`search source=`) and must be explicitly included to be returned.
-
-Examples
----------
-
-Query metadata fields::
-
-    os> source=accounts | fields firstname, lastname, _index, _sort;
-    fetched rows / total rows = 4/4
-    +-----------+----------+----------+-------+
-    | firstname | lastname | _index   | _sort |
-    |-----------+----------+----------+-------|
-    | Amber     | Duke     | accounts | -2    |
-    | Hattie    | Bond     | accounts | -2    |
-    | Nanette   | Bates    | accounts | -2    |
-    | Dale      | Adams    | accounts | -2    |
-    +-----------+----------+----------+-------+
-
diff --git a/docs/user/ppl/index.md b/docs/user/ppl/index.md
new file mode 100644
index 00000000000..a8fcb5a480b
--- /dev/null
+++ b/docs/user/ppl/index.md
@@ -0,0 +1,100 @@
+# OpenSearch PPL Reference Manual
+
+### Overview
+
+Piped Processing Language (PPL), powered by OpenSearch, enables OpenSearch users to explore and discover data stored in OpenSearch, and to find search patterns in it, using a set of commands delimited by pipes (\|). These are essentially read-only requests to process data and return results.
+
+Currently, OpenSearch users can query data using either Query DSL or SQL. Query DSL is powerful and fast. However, it has a steep learning curve, and was not designed as a human interface to easily create ad hoc queries and explore user data. SQL allows users to extract and analyze data in OpenSearch in a declarative manner. OpenSearch now makes its search and query engine robust by introducing Piped Processing Language (PPL). It enables users to extract insights from OpenSearch with a sequence of commands delimited by pipes (\|). It supports a comprehensive set of commands including search, where, fields, rename, dedup, sort, eval, head, top and rare, as well as functions, operators and expressions. Even new users who have recently adopted OpenSearch can be productive from day one if they are familiar with the pipe (\|) syntax. It enables developers, DevOps engineers, support engineers, site reliability engineers (SREs), and IT managers to effectively discover and explore log, monitoring and observability data stored in OpenSearch.
+
+We expand the capabilities of our Workbench, a comprehensive and integrated visual query tool currently supporting only SQL, to run on-demand PPL commands, and view and save results as text and JSON. We also add a new interactive standalone command line tool, the PPL CLI, to run on-demand PPL commands, and view and save results as text and JSON.
+A query starts with a search command, followed by a set of commands delimited by pipes (\|).
+
+For example, the following query retrieves firstname and lastname from accounts whose age is greater than 18.
+ +``` +source=accounts +| where age > 18 +| fields firstname, lastname +``` + +* **Interfaces** + - [Endpoint](interfaces/endpoint.md) + - [Protocol](interfaces/protocol.md) +* **Administration** + - [Plugin Settings](admin/settings.md) + - [Security Settings](admin/security.md) + - [Monitoring](admin/monitoring.md) + - [Datasource Settings](admin/datasources.md) + - [Prometheus Connector](admin/connectors/prometheus_connector.md) + - [Cross-Cluster Search](admin/cross_cluster_search.md) +* **Language Structure** + - [Identifiers](general/identifiers.md) + - [Data Types](general/datatypes.md) +* **Commands** + + The following commands are available in PPL: + **Note:** Experimental commands are ready for use, but specific parameters may change based on feedback. + +| Command Name | Version Introduced | Current Status | Command Description | +| --- | --- | --- | --- | +| [search command](cmd/search.md) | 1.0 | stable (since 1.0) | Retrieve documents from the index. | +| [where command](cmd/where.md) | 1.0 | stable (since 1.0) | Filter the search result using boolean expressions. | +| [subquery command](cmd/subquery.md) | 3.0 | experimental (since 3.0) | Embed one PPL query inside another for complex filtering and data retrieval operations. | +| [fields command](cmd/fields.md) | 1.0 | stable (since 1.0) | Keep or remove fields from the search result. | +| [rename command](cmd/rename.md) | 1.0 | stable (since 1.0) | Rename one or more fields in the search result. | +| [eval command](cmd/eval.md) | 1.0 | stable (since 1.0) | Evaluate an expression and append the result to the search result. | +| [replace command](cmd/replace.md) | 3.4 | experimental (since 3.4) | Replace text in one or more fields in the search result | +| [fillnull command](cmd/fillnull.md) | 3.0 | experimental (since 3.0) | Fill null with provided value in one or more fields in the search result. | +| [expand command](cmd/expand.md) | 3.1 | experimental (since 3.1) | Transform a single document into multiple documents by expanding a nested array field. | +| [flatten command](cmd/flatten.md) | 3.1 | experimental (since 3.1) | Flatten a struct or an object field into separate fields in a document. | +| [table command](cmd/table.md) | 3.3 | experimental (since 3.3) | Keep or remove fields from the search result using enhanced syntax options. | +| [stats command](cmd/stats.md) | 1.0 | stable (since 1.0) | Calculate aggregation from search results. | +| [eventstats command](cmd/eventstats.md) | 3.1 | experimental (since 3.1) | Calculate aggregation statistics and add them as new fields to each event. | +| [streamstats command](cmd/streamstats.md) | 3.4 | experimental (since 3.4) | Calculate cumulative or rolling statistics as events are processed in order. | +| [bin command](cmd/bin.md) | 3.3 | experimental (since 3.3) | Group numeric values into buckets of equal intervals. | +| [timechart command](cmd/timechart.md) | 3.3 | experimental (since 3.3) | Create time-based charts and visualizations. | +| [chart command](cmd/chart.md) | 3.4 | experimental (since 3.4) | Apply statistical aggregations to search results and group the data for visualizations. | +| [trendline command](cmd/trendline.md) | 3.0 | experimental (since 3.0) | Calculate moving averages of fields. | +| [sort command](cmd/sort.md) | 1.0 | stable (since 1.0) | Sort all the search results by the specified fields. | +| [reverse command](cmd/reverse.md) | 3.2 | experimental (since 3.2) | Reverse the display order of search results. 
| +| [head command](cmd/head.md) | 1.0 | stable (since 1.0) | Return the first N number of specified results after an optional offset in search order. | +| [dedup command](cmd/dedup.md) | 1.0 | stable (since 1.0) | Remove identical documents defined by the field from the search result. | +| [top command](cmd/top.md) | 1.0 | stable (since 1.0) | Find the most common tuple of values of all fields in the field list. | +| [rare command](cmd/rare.md) | 1.0 | stable (since 1.0) | Find the least common tuple of values of all fields in the field list. | +| [parse command](cmd/parse.md) | 1.3 | stable (since 1.3) | Parse a text field with a regular expression and append the result to the search result. | +| [grok command](cmd/grok.md) | 2.4 | stable (since 2.4) | Parse a text field with a grok pattern and append the results to the search result. | +| [rex command](cmd/rex.md) | 3.3 | experimental (since 3.3) | Extract fields from a raw text field using regular expression named capture groups. | +| [regex command](cmd/regex.md) | 3.3 | experimental (since 3.3) | Filter search results by matching field values against a regular expression pattern. | +| [spath command](cmd/spath.md) | 3.3 | experimental (since 3.3) | Extract fields from structured text data. | +| [patterns command](cmd/patterns.md) | 2.4 | stable (since 2.4) | Extract log patterns from a text field and append the results to the search result. | +| [join command](cmd/join.md) | 3.0 | stable (since 3.0) | Combine two datasets together. | +| [append command](cmd/append.md) | 3.3 | experimental (since 3.3) | Append the result of a sub-search to the bottom of the input search results. | +| [appendcol command](cmd/appendcol.md) | 3.1 | experimental (since 3.1) | Append the result of a sub-search and attach it alongside the input search results. | +| [lookup command](cmd/lookup.md) | 3.0 | experimental (since 3.0) | Add or replace data from a lookup index. | +| [multisearch command](cmd/multisearch.md) | 3.4 | experimental (since 3.4) | Execute multiple search queries and combine their results. | +| [ml command](cmd/ml.md) | 2.5 | stable (since 2.5) | Apply machine learning algorithms to analyze data. | +| [kmeans command](cmd/kmeans.md) | 1.3 | stable (since 1.3) | Apply the kmeans algorithm on the search result returned by a PPL command. | +| [ad command](cmd/ad.md) | 1.3 | deprecated (since 2.5) | Apply Random Cut Forest algorithm on the search result returned by a PPL command. | +| [describe command](cmd/describe.md) | 2.1 | stable (since 2.1) | Query the metadata of an index. | +| [explain command](cmd/explain.md) | 3.1 | stable (since 3.1) | Explain the plan of query. | +| [show datasources command](cmd/showdatasources.md) | 2.4 | stable (since 2.4) | Query datasources configured in the PPL engine. 
| + + - [Syntax](cmd/syntax.md) - PPL query structure and command syntax formatting +* **Functions** + - [Aggregation Functions](functions/aggregations.md) + - [Collection Functions](functions/collection.md) + - [Condition Functions](functions/condition.md) + - [Cryptographic Functions](functions/cryptographic.md) + - [Date and Time Functions](functions/datetime.md) + - [Expressions](functions/expressions.md) + - [IP Address Functions](functions/ip.md) + - [JSON Functions](functions/json.md) + - [Math Functions](functions/math.md) + - [Relevance Functions](functions/relevance.md) + - [String Functions](functions/string.md) + - [System Functions](functions/system.md) + - [Type Conversion Functions](functions/conversion.md) +* **Optimization** + - [Optimization](../../user/optimization/optimization.rst) +* **Limitations** + - [Limitations](limitations/limitations.md) \ No newline at end of file diff --git a/docs/user/ppl/index.rst b/docs/user/ppl/index.rst deleted file mode 100644 index 981b2de3169..00000000000 --- a/docs/user/ppl/index.rst +++ /dev/null @@ -1,137 +0,0 @@ - -=============================== -OpenSearch PPL Reference Manual -=============================== - -Overview ---------- -Piped Processing Language (PPL), powered by OpenSearch, enables OpenSearch users with exploration and discovery of, and finding search patterns in data stored in OpenSearch, using a set of commands delimited by pipes (|). These are essentially read-only requests to process data and return results. - -Currently, OpenSearch users can query data using either Query DSL or SQL. Query DSL is powerful and fast. However, it has a steep learning curve, and was not designed as a human interface to easily create ad hoc queries and explore user data. SQL allows users to extract and analyze data in OpenSearch in a declarative manner. OpenSearch now makes its search and query engine robust by introducing Piped Processing Language (PPL). It enables users to extract insights from OpenSearch with a sequence of commands delimited by pipes (|). It supports a comprehensive set of commands including search, where, fields, rename, dedup, sort, eval, head, top and rare, and functions, operators and expressions. Even new users who have recently adopted OpenSearch, can be productive day one, if they are familiar with the pipe (|) syntax. It enables developers, DevOps engineers, support engineers, site reliability engineers (SREs), and IT managers to effectively discover and explore log, monitoring and observability data stored in OpenSearch. - -We expand the capabilities of our Workbench, a comprehensive and integrated visual query tool currently supporting only SQL, to run on-demand PPL commands, and view and save results as text and JSON. We also add a new interactive standalone command line tool, the PPL CLI, to run on-demand PPL commands, and view and save results as text and JSON. - -The query start with search command and then flowing a set of command delimited by pipe (|). -| for example, the following query retrieve firstname and lastname from accounts if age large than 18. - -.. 
code-block:: - - source=accounts - | where age > 18 - | fields firstname, lastname - -* **Interfaces** - - - `Endpoint `_ - - - `Protocol `_ - -* **Administration** - - - `Plugin Settings `_ - - - `Security Settings `_ - - - `Monitoring `_ - - - `Datasource Settings `_ - - - `Prometheus Connector `_ - - - `Cross-Cluster Search `_ - -* **Language Structure** - - - `Identifiers `_ - - - `Data Types `_ - -* **Commands** - - The following commands are available in PPL: - - **Note:** Experimental commands are ready for use, but specific parameters may change based on feedback. - - ============================================================== ================== ======================== ============================================================================================== - Command Name Version Introduced Current Status Command Description - ============================================================== ================== ======================== ============================================================================================== - `search command `_ 1.0 stable (since 1.0) Retrieve documents from the index. - `where command `_ 1.0 stable (since 1.0) Filter the search result using boolean expressions. - `subquery command `_ 3.0 experimental (since 3.0) Embed one PPL query inside another for complex filtering and data retrieval operations. - `fields command `_ 1.0 stable (since 1.0) Keep or remove fields from the search result. - `rename command `_ 1.0 stable (since 1.0) Rename one or more fields in the search result. - `eval command `_ 1.0 stable (since 1.0) Evaluate an expression and append the result to the search result. - `replace command `_ 3.4 experimental (since 3.4) Replace text in one or more fields in the search result - `fillnull command `_ 3.0 experimental (since 3.0) Fill null with provided value in one or more fields in the search result. - `expand command `_ 3.1 experimental (since 3.1) Transform a single document into multiple documents by expanding a nested array field. - `flatten command `_ 3.1 experimental (since 3.1) Flatten a struct or an object field into separate fields in a document. - `table command `_ 3.3 experimental (since 3.3) Keep or remove fields from the search result using enhanced syntax options. - `stats command `_ 1.0 stable (since 1.0) Calculate aggregation from search results. - `eventstats command `_ 3.1 experimental (since 3.1) Calculate aggregation statistics and add them as new fields to each event. - `streamstats command `_ 3.4 experimental (since 3.4) Calculate cumulative or rolling statistics as events are processed in order. - `bin command `_ 3.3 experimental (since 3.3) Group numeric values into buckets of equal intervals. - `timechart command `_ 3.3 experimental (since 3.3) Create time-based charts and visualizations. - `chart command `_ 3.4 experimental (since 3.4) Apply statistical aggregations to search results and group the data for visualizations. - `trendline command `_ 3.0 experimental (since 3.0) Calculate moving averages of fields. - `sort command `_ 1.0 stable (since 1.0) Sort all the search results by the specified fields. - `reverse command `_ 3.2 experimental (since 3.2) Reverse the display order of search results. - `head command `_ 1.0 stable (since 1.0) Return the first N number of specified results after an optional offset in search order. - `dedup command `_ 1.0 stable (since 1.0) Remove identical documents defined by the field from the search result. 
- `top command `_ 1.0 stable (since 1.0) Find the most common tuple of values of all fields in the field list. - `rare command `_ 1.0 stable (since 1.0) Find the least common tuple of values of all fields in the field list. - `parse command `_ 1.3 stable (since 1.3) Parse a text field with a regular expression and append the result to the search result. - `grok command `_ 2.4 stable (since 2.4) Parse a text field with a grok pattern and append the results to the search result. - `rex command `_ 3.3 experimental (since 3.3) Extract fields from a raw text field using regular expression named capture groups. - `regex command `_ 3.3 experimental (since 3.3) Filter search results by matching field values against a regular expression pattern. - `spath command `_ 3.3 experimental (since 3.3) Extract fields from structured text data. - `patterns command `_ 2.4 stable (since 2.4) Extract log patterns from a text field and append the results to the search result. - `join command `_ 3.0 stable (since 3.0) Combine two datasets together. - `append command `_ 3.3 experimental (since 3.3) Append the result of a sub-search to the bottom of the input search results. - `appendcol command `_ 3.1 experimental (since 3.1) Append the result of a sub-search and attach it alongside the input search results. - `lookup command `_ 3.0 experimental (since 3.0) Add or replace data from a lookup index. - `multisearch command `_ 3.4 experimental (since 3.4) Execute multiple search queries and combine their results. - `ml command `_: 2.5 stable (since 2.5) Apply machine learning algorithms to analyze data. - `kmeans command `_ 1.3 stable (since 1.3) Apply the kmeans algorithm on the search result returned by a PPL command. - `ad command `_ 1.3 deprecated (since 2.5) Apply Random Cut Forest algorithm on the search result returned by a PPL command. - `describe command `_ 2.1 stable (since 2.1) Query the metadata of an index. - `explain command `_ 3.1 stable (since 3.1) Explain the plan of query. - `show datasources command `_ 2.4 stable (since 2.4) Query datasources configured in the PPL engine. - ============================================================== ================== ======================== ============================================================================================== - - - `Syntax `_ - PPL query structure and command syntax formatting - -* **Functions** - - - `Aggregation Functions `_ - - - `Collection Functions `_ - - - `Condition Functions `_ - - - `Cryptographic Functions `_ - - - `Date and Time Functions `_ - - - `Expressions `_ - - - `IP Address Functions `_ - - - `JSON Functions `_ - - - `Math Functions `_ - - - `Relevance Functions `_ - - - `String Functions `_ - - - `System Functions `_ - - - `Type Conversion Functions `_ - -* **Optimization** - - - `Optimization <../../user/optimization/optimization.rst>`_ - -* **Limitations** - - - `Limitations `_ diff --git a/docs/user/ppl/interfaces/endpoint.md b/docs/user/ppl/interfaces/endpoint.md new file mode 100644 index 00000000000..e1e9cf705bf --- /dev/null +++ b/docs/user/ppl/interfaces/endpoint.md @@ -0,0 +1,154 @@ +# Endpoint + +## Introduction + +To send query request to PPL plugin, you MUST use HTTP POST request. POST request doesn't have length limitation and allows for other parameters passed to plugin for other functionality such as prepared statement. And also the explain endpoint is used very often for query translation and troubleshooting. 
+## POST
+
+### Description
+
+You can send an HTTP POST request to the **/_plugins/_ppl** endpoint with your query in the request body.
+
+### Example
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl \
+-d '{"query" : "source=accounts | fields firstname, lastname"}'
+```
+
+Expected output:
+
+```json
+{
+  "schema": [
+    {
+      "name": "firstname",
+      "type": "string"
+    },
+    {
+      "name": "lastname",
+      "type": "string"
+    }
+  ],
+  "datarows": [
+    [
+      "Amber",
+      "Duke"
+    ],
+    [
+      "Hattie",
+      "Bond"
+    ],
+    [
+      "Nanette",
+      "Bates"
+    ],
+    [
+      "Dale",
+      "Adams"
+    ]
+  ],
+  "total": 4,
+  "size": 4
+}
+```
+
+## Explain
+
+### Description
+
+You can send an HTTP explain request to the **/_plugins/_ppl/_explain** endpoint with your query in the request body to understand the execution plan for the PPL query. The explain endpoint is useful when you want insight into how a query is executed in the engine.
+
+To translate your query, send it to the explain endpoint. The explain output is OpenSearch domain-specific language (DSL) in JSON format. You can copy and paste it into your console to run it against OpenSearch directly.
+
+The explain output can be returned in different formats: `standard` (the default format), `simple`, `extended`, and `dsl`. An experimental `yaml` format is also available (see Example 4).
+
+### Example 1: default (standard) format
+
+Explain query:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl/_explain \
+-d '{"query" : "source=state_country | where age>30"}'
+```
+
+Expected output:
+
+```json
+{
+  "calcite": {
+    "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(name=[$0], country=[$1], state=[$2], month=[$3], year=[$4], age=[$5])\n LogicalFilter(condition=[>($5, 30)])\n CalciteLogicalIndexScan(table=[[OpenSearch, state_country]])\n",
+    "physical": "CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[PROJECT->[name, country, state, month, year, age], FILTER->>($5, 30), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\",\"query\":{\"range\":{\"age\":{\"from\":30,\"to\":null,\"include_lower\":false,\"include_upper\":true,\"boost\":1.0}}},\"_source\":{\"includes\":[\"name\",\"country\",\"state\",\"month\",\"year\",\"age\"],\"excludes\":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])\n"
+  }
+}
+```
+
+### Example 2: simple format
+
+Explain query:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl/_explain?format=simple \
+-d '{"query" : "source=state_country | where age>30"}'
+```
+
+Expected output:
+
+```json
+{
+  "calcite": {
+    "logical": "LogicalSystemLimit\n LogicalProject\n LogicalFilter\n CalciteLogicalIndexScan\n"
+  }
+}
+```
+
+### Example 3: extended format
+
+Explain query:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl/_explain?format=extended \
+-d '{"query" : "source=state_country | where age>30"}'
+```
+
+Expected output:
+
+```json
+{
+  "calcite": {
+    "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(name=[$0], country=[$1], state=[$2], month=[$3], year=[$4], age=[$5])\n LogicalFilter(condition=[>($5, 30)])\n CalciteLogicalIndexScan(table=[[OpenSearch, state_country]])\n",
+    "physical": "CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[PROJECT->[name, country, state, month, year, age], FILTER->>($5, 30), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\",\"query\":{\"range\":{\"age\":{\"from\":30,\"to\":null,\"include_lower\":false,\"include_upper\":true,\"boost\":1.0}}},\"_source\":{\"includes\":[\"name\",\"country\",\"state\",\"month\",\"year\",\"age\"],\"excludes\":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])\n",
+    "extended": "public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) {\n final org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan v1stashed = (org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan) root.get(\"v1stashed\");\n return v1stashed.scan();\n}\n\n\npublic Class getElementType() {\n return java.lang.Object[].class;\n}\n\n\n"
+  }
+}
+```
+
+### Example 4: YAML format (experimental)
+
+> **Note:** YAML explain output is an experimental feature and is not intended for production use. The interface and output may change without notice.
+
+Returns the explain response in `yaml` format.
+
+Explain query:
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl/_explain?format=yaml \
+-d '{"query" : "source=state_country | where age>30"}'
+```
+
+Expected output:
+
+```yaml
+calcite:
+  logical: |
+    LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
+      LogicalProject(name=[$0], country=[$1], state=[$2], month=[$3], year=[$4], age=[$5])
+        LogicalFilter(condition=[>($5, 30)])
+          CalciteLogicalIndexScan(table=[[OpenSearch, state_country]])
+  physical: |
+    CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[PROJECT->[name, country, state, month, year, age], FILTER->>($5, 30), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"range":{"age":{"from":30,"to":null,"include_lower":false,"include_upper":true,"boost":1.0}}},"_source":{"includes":["name","country","state","month","year","age"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])
+```
\ No newline at end of file
diff --git a/docs/user/ppl/interfaces/endpoint.rst b/docs/user/ppl/interfaces/endpoint.rst
deleted file mode 100644
index 08032ad6cda..00000000000
--- a/docs/user/ppl/interfaces/endpoint.rst
+++ /dev/null
@@ -1,150 +0,0 @@
-.. highlight:: sh
-
-========
-Endpoint
-========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-
-Introduction
-============
-
-To send query request to PPL plugin, you MUST use HTTP POST request. POST request doesn't have length limitation and allows for other parameters passed to plugin for other functionality such as prepared statement. And also the explain endpoint is used very often for query translation and troubleshooting.
-
-POST
-====
-
-Description
------------
-
-You can send HTTP POST request to endpoint **/_plugins/_ppl** with your query in request body.
-
-Example
--------
-
-PPL query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl \
-    ... -d '{"query" : "source=accounts | fields firstname, lastname"}'
-    {
-      "schema": [
-        {
-          "name": "firstname",
-          "type": "string"
-        },
-        {
-          "name": "lastname",
-          "type": "string"
-        }
-      ],
-      "datarows": [
-        [
-          "Amber",
-          "Duke"
-        ],
-        [
-          "Hattie",
-          "Bond"
-        ],
-        [
-          "Nanette",
-          "Bates"
-        ],
-        [
-          "Dale",
-          "Adams"
-        ]
-      ],
-      "total": 4,
-      "size": 4
-    }
-
-Explain
-=======
-
-Description
------------
-
-You can send HTTP explain request to endpoint **/_plugins/_ppl/_explain** with your query in request body to understand the execution plan for the PPL query. The explain endpoint is useful when user want to get insight how the query is executed in the engine.
-
-Description
------------
-
-To translate your query, send it to explain endpoint. The explain output is OpenSearch domain specific language (DSL) in JSON format. You can just copy and paste it to your console to run it against OpenSearch directly.
-
-Explain output could be set different formats: ``standard`` (the default format), ``simple``, ``extended``, ``dsl``.
-
-
-Example 1 default (standard) format
------------------------------------
-
-Explain query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl/_explain \
-    ... -d '{"query" : "source=state_country | where age>30"}'
-    {
-      "calcite": {
-        "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(name=[$0], country=[$1], state=[$2], month=[$3], year=[$4], age=[$5])\n LogicalFilter(condition=[>($5, 30)])\n CalciteLogicalIndexScan(table=[[OpenSearch, state_country]])\n",
-        "physical": "CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[PROJECT->[name, country, state, month, year, age], FILTER->>($5, 30), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\",\"query\":{\"range\":{\"age\":{\"from\":30,\"to\":null,\"include_lower\":false,\"include_upper\":true,\"boost\":1.0}}},\"_source\":{\"includes\":[\"name\",\"country\",\"state\",\"month\",\"year\",\"age\"],\"excludes\":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])\n"
-      }
-    }
-
-Example 2 simple format
------------------------
-
-Explain query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl/_explain?format=simple \
-    ... -d '{"query" : "source=state_country | where age>30"}'
-    {
-      "calcite": {
-        "logical": "LogicalSystemLimit\n LogicalProject\n LogicalFilter\n CalciteLogicalIndexScan\n"
-      }
-    }
-
-Example 3 extended format
--------------------------
-
-Explain query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl/_explain?format=extended \
-    ... -d '{"query" : "source=state_country | where age>30"}'
-    {
-      "calcite": {
-        "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(name=[$0], country=[$1], state=[$2], month=[$3], year=[$4], age=[$5])\n LogicalFilter(condition=[>($5, 30)])\n CalciteLogicalIndexScan(table=[[OpenSearch, state_country]])\n",
-        "physical": "CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[PROJECT->[name, country, state, month, year, age], FILTER->>($5, 30), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"size\":10000,\"timeout\":\"1m\",\"query\":{\"range\":{\"age\":{\"from\":30,\"to\":null,\"include_lower\":false,\"include_upper\":true,\"boost\":1.0}}},\"_source\":{\"includes\":[\"name\",\"country\",\"state\",\"month\",\"year\",\"age\"],\"excludes\":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])\n",
-        "extended": "public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) {\n final org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan v1stashed = (org.opensearch.sql.opensearch.storage.scan.CalciteEnumerableIndexScan) root.get(\"v1stashed\");\n return v1stashed.scan();\n}\n\n\npublic Class getElementType() {\n return java.lang.Object[].class;\n}\n\n\n"
-      }
-    }
-
-Example 4 YAML format (experimental)
------------------------------------
-
-.. note::
-   YAML explain output is an experimental feature and not intended for
-   production use. The interface and output may change without notice.
-
-Return Explain response format in In ``yaml`` format.
-
-Explain query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl/_explain?format=yaml \
-    ... -d '{"query" : "source=state_country | where age>30"}'
-    calcite:
-      logical: |
-        LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
-          LogicalProject(name=[$0], country=[$1], state=[$2], month=[$3], year=[$4], age=[$5])
-            LogicalFilter(condition=[>($5, 30)])
-              CalciteLogicalIndexScan(table=[[OpenSearch, state_country]])
-      physical: |
-        CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[PROJECT->[name, country, state, month, year, age], FILTER->>($5, 30), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"range":{"age":{"from":30,"to":null,"include_lower":false,"include_upper":true,"boost":1.0}}},"_source":{"includes":["name","country","state","month","year","age"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])
diff --git a/docs/user/ppl/interfaces/protocol.md b/docs/user/ppl/interfaces/protocol.md
new file mode 100644
index 00000000000..680f01fd379
--- /dev/null
+++ b/docs/user/ppl/interfaces/protocol.md
@@ -0,0 +1,130 @@
+# Protocol
+
+## Introduction
+
+The PPL endpoint returns its responses in the JDBC format. The JDBC format is widely used because it provides schema information and supports additional functionality such as pagination. Besides the JDBC driver, various clients can benefit from the detailed and well-formatted response.
+
+## Request/Response Format
+
+### Description
+
+The body of the HTTP POST request contains the PPL query.
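+
+As the examples below show, the response pairs a `schema` array (field names and types) with positional `datarows`, so a client can rebuild named records generically. The sketch below is illustrative only; it assumes the Python `requests` package and an unsecured local cluster:
+
+```python
+# Minimal sketch: turn a JDBC-format PPL response into dictionaries.
+import requests
+
+body = requests.post(
+    "http://localhost:9200/_plugins/_ppl",
+    json={"query": "source=accounts | fields firstname, lastname"},
+    headers={"Content-Type": "application/json"},
+    timeout=10,
+).json()
+
+columns = [col["name"] for col in body["schema"]]  # e.g. ["firstname", "lastname"]
+records = [dict(zip(columns, row)) for row in body["datarows"]]
+print(records[0])  # e.g. {'firstname': 'Amber', 'lastname': 'Duke'}
+```
+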
+### Example 1
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl \
+-d '{"query" : "source=accounts | fields firstname, lastname"}'
+```
+
+Expected output:
+
+```json
+{
+  "schema": [
+    {
+      "name": "firstname",
+      "type": "string"
+    },
+    {
+      "name": "lastname",
+      "type": "string"
+    }
+  ],
+  "datarows": [
+    [
+      "Amber",
+      "Duke"
+    ],
+    [
+      "Hattie",
+      "Bond"
+    ],
+    [
+      "Nanette",
+      "Bates"
+    ],
+    [
+      "Dale",
+      "Adams"
+    ]
+  ],
+  "total": 4,
+  "size": 4
+}
+```
+
+## JDBC Format
+
+### Description
+
+By default, the plugin returns the JDBC format. The JDBC format is intended for the JDBC driver and clients that need both the schema and the result set to be well formatted.
+
+### Example 1
+
+Here is an example of a normal response. The `schema` includes each field name and its type, and `datarows` includes the result set.
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl \
+-d '{"query" : "source=accounts | fields firstname, lastname"}'
+```
+
+Expected output:
+
+```json
+{
+  "schema": [
+    {
+      "name": "firstname",
+      "type": "string"
+    },
+    {
+      "name": "lastname",
+      "type": "string"
+    }
+  ],
+  "datarows": [
+    [
+      "Amber",
+      "Duke"
+    ],
+    [
+      "Hattie",
+      "Bond"
+    ],
+    [
+      "Nanette",
+      "Bates"
+    ],
+    [
+      "Dale",
+      "Adams"
+    ]
+  ],
+  "total": 4,
+  "size": 4
+}
+```
+
+### Example 2
+
+If an error occurs, the error message and its cause are returned instead.
+
+```bash ppl
+curl -sS -H 'Content-Type: application/json' \
+-X POST localhost:9200/_plugins/_ppl \
+-d '{"query" : "source=unknown | fields firstname, lastname"}'
+```
+
+Expected output:
+
+```json
+{
+  "error": {
+    "reason": "Error occurred in OpenSearch engine: no such index [unknown]",
+    "details": "[unknown] IndexNotFoundException[no such index [unknown]]\nFor more details, please send request for Json format to see the raw response from OpenSearch engine.",
+    "type": "IndexNotFoundException"
+  },
+  "status": 404
+}
+```
\ No newline at end of file
diff --git a/docs/user/ppl/interfaces/protocol.rst b/docs/user/ppl/interfaces/protocol.rst
deleted file mode 100644
index a76dba301b5..00000000000
--- a/docs/user/ppl/interfaces/protocol.rst
+++ /dev/null
@@ -1,137 +0,0 @@
-.. highlight:: sh
-
-========
-Protocol
-========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 1
-
-
-Introduction
-============
-
-For the protocol, PPL endpoint provides response formats in the JDBC format. JDBC format is widely used because it provides schema information and more functionality such as pagination. Besides JDBC driver, various clients can benefit from the detailed and well formatted response.
-
-
-Request/Response Format
-==============
-
-Description
------------
-
-The body of HTTP POST request can take PPL query.
-
-Example 1
----------
-PPL query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl \
-    ... -d '{"query" : "source=accounts | fields firstname, lastname"}'
-    {
-      "schema": [
-        {
-          "name": "firstname",
-          "type": "string"
-        },
-        {
-          "name": "lastname",
-          "type": "string"
-        }
-      ],
-      "datarows": [
-        [
-          "Amber",
-          "Duke"
-        ],
-        [
-          "Hattie",
-          "Bond"
-        ],
-        [
-          "Nanette",
-          "Bates"
-        ],
-        [
-          "Dale",
-          "Adams"
-        ]
-      ],
-      "total": 4,
-      "size": 4
-    }
-
-JDBC Format
-===========
-
-Description
------------
-
-By default the plugin return JDBC format. JDBC format is provided for JDBC driver and client side that needs both schema and result set well formatted.
-
-Example 1
----------
-
-Here is an example for normal response. The `schema` includes field name and its type and `datarows` includes the result set.
-
-PPL query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl \
-    ... -d '{"query" : "source=accounts | fields firstname, lastname"}'
-    {
-      "schema": [
-        {
-          "name": "firstname",
-          "type": "string"
-        },
-        {
-          "name": "lastname",
-          "type": "string"
-        }
-      ],
-      "datarows": [
-        [
-          "Amber",
-          "Duke"
-        ],
-        [
-          "Hattie",
-          "Bond"
-        ],
-        [
-          "Nanette",
-          "Bates"
-        ],
-        [
-          "Dale",
-          "Adams"
-        ]
-      ],
-      "total": 4,
-      "size": 4
-    }
-
-Example 2
----------
-
-If any error occurred, error message and the cause will be returned instead.
-
-PPL query::
-
-    sh$ curl -sS -H 'Content-Type: application/json' \
-    ... -X POST localhost:9200/_plugins/_ppl \
-    ... -d '{"query" : "source=unknown | fields firstname, lastname"}'
-    {
-      "error": {
-        "reason": "Error occurred in OpenSearch engine: no such index [unknown]",
-        "details": "[unknown] IndexNotFoundException[no such index [unknown]]\nFor more details, please send request for Json format to see the raw response from OpenSearch engine.",
-        "type": "IndexNotFoundException"
-      },
-      "status": 404
-    }
-
diff --git a/docs/user/ppl/limitations/limitations.md b/docs/user/ppl/limitations/limitations.md
new file mode 100644
index 00000000000..6ef9bd7407b
--- /dev/null
+++ b/docs/user/ppl/limitations/limitations.md
@@ -0,0 +1,89 @@
+# Limitations
+
+## Inconsistent Field Types across Indices
+
+* If the same field has different types across indices (e.g., `field` is a `string` in one index and an `integer` in another), PPL selects a field type from one of the indices—this selection is non-deterministic. Fields with other types are ignored during query execution.
+* For `object` fields, [PPL merges subfields from different indices to tolerate schema variations](https://github.com/opensearch-project/sql/issues/3625).
+
+## Unsupported OpenSearch Field Types
+
+PPL does not support all [OpenSearch data types](https://docs.opensearch.org/latest/mappings/supported-field-types/index/) (e.g., `flattened` and some complex `nested` usages). Unsupported fields are excluded from `DESCRIBE` and `SOURCE` outputs. At runtime, queries referencing unsupported fields fail with semantic or resolution errors, and such fields are ignored in projections unless explicitly filtered out or removed at ingestion.
+
+| OpenSearch Data Type | PPL |
+| --- | --- |
+| knn_vector | Ignored |
+| Range field types | Ignored |
+| Object - flat_object | Ignored |
+| Object - join | Ignored |
+| String - Match-only text | Ignored |
+| String - Wildcard | Ignored |
+| String - token_count | Ignored |
+| String - constant_keyword | Ignored |
+| Autocomplete | Ignored |
+| Geoshape | Ignored |
+| Cartesian field types | Ignored |
+| Rank field types | Ignored |
+| Star-tree | Ignored |
+| derived | Ignored |
+| Percolator | Ignored |
+
+## Field Parameters
+
+For a field to be queryable in PPL, the following index settings must be enabled:
+
+| Setting | Description | Required For |
+| --- | --- | --- |
+| _source: true | Stores the original JSON document | Required for fetching raw data |
+| index: true | Enables field indexing | Required for filtering, search, and aggregations |
+| doc_values: true | Enables columnar access for aggregations/sorting | Required for `stats`, `sort` |
+
+## Nested Field Behavior
+
+* There are [limitations](https://github.com/opensearch-project/sql/issues/52) regarding the nested levels and query types that need improvement.
+
+## Multi-value Field Behavior
+
+OpenSearch does not natively support the ARRAY data type but does allow multi-value fields implicitly. The SQL/PPL plugin adheres strictly to the data type semantics defined in index mappings. When parsing OpenSearch responses, it expects data to match the declared type and does not account for data in array format. If the `plugins.query.field_type_tolerance` setting is enabled, the SQL/PPL plugin handles array datasets by returning scalar data types, allowing basic queries (e.g., `source = tbl | where condition`). However, using multi-value fields in expressions or functions results in exceptions. If this setting is disabled or absent, only the first element of an array is returned, preserving the default behavior.
+
+## Unsupported Functionalities in the Calcite Engine
+
+Since 3.0.0, Apache Calcite has been introduced as an experimental query engine. Please see [introduce v3 engine](../../../dev/intro-v3-engine.md).
+Queries that use the following functionalities are forwarded to the V2 query engine, which means they cannot be combined with the new PPL commands/functions introduced in 3.0.0 and above:
+* All SQL queries
+* PPL queries against non-OpenSearch data sources
+* `dedup` with `consecutive=true`
+* Search relevant commands
+  * AD
+  * ML
+  * Kmeans
+* `show datasources` command
+* Commands with the `fetch_size` parameter
+
+## Malformed Field Names in Object Fields
+
+OpenSearch normally rejects field names containing problematic dot patterns (such as `.`, `..`, `.a`, `a.`, or `a..b`). However, when an object field has `enabled: false`, OpenSearch bypasses field name validation and allows storing documents with any field names.
+
+If a document contains malformed field names inside an object field, PPL ignores those malformed field names. Other valid fields in the document are returned normally.
+
+**Example of affected data:**
+
+```json
+{
+  "log": {
+    ".": "value1",
+    ".a": "value2",
+    "a.": "value3",
+    "a..b": "value4"
+  }
+}
+```
+
+When `log` is an object field with `enabled: false`, subfields with malformed names are ignored.
+
+**Recommendation:** Avoid using field names that contain leading dots, trailing dots, consecutive dots, or consist only of dots. This aligns with OpenSearch's default field naming requirements.
diff --git a/docs/user/ppl/limitations/limitations.rst b/docs/user/ppl/limitations/limitations.rst
deleted file mode 100644
index 41d3a007d23..00000000000
--- a/docs/user/ppl/limitations/limitations.rst
+++ /dev/null
@@ -1,132 +0,0 @@
-===========
-Limitations
-===========
-
-.. rubric:: Table of contents
-
-.. contents::
-   :local:
-   :depth: 2
-
-Inconsistent Field Types across indices
-=======================================
-
-* If the same field has different types across indices (e.g., ``field`` is a ``string`` in one index and an ``integer`` in another), PPL selects a field type from one of the indices—this selection is non-deterministic. Fields with other types are ignored during query execution.
-* For ``object`` fields, `PPL merges subfields from different indices to tolerate schema variations `_.
-
-Unsupported OpenSearch Field Types
-==================================
-
-PPL does not support all `OpenSearch data types `_. (e.g., ``flattened``, some complex ``nested`` usages). Unsupported fields are excluded from ``DESCRIBE`` and ``SOURCE`` outputs. At runtime: Queries referencing unsupported fields fail with semantic or resolution errors. Such fields are ignored in projections unless explicitly filtered out or removed at ingestion.
-
-+---------------------------+---------+
-| OpenSearch Data Type      | PPL     |
-+===========================+=========+
-| knn_vector                | Ignored |
-+---------------------------+---------+
-| Range field types         | Ignored |
-+---------------------------+---------+
-| Object - flat_object      | Ignored |
-+---------------------------+---------+
-| Object - join             | Ignored |
-+---------------------------+---------+
-| String - Match-only text  | Ignored |
-+---------------------------+---------+
-| String - Wildcard         | Ignored |
-+---------------------------+---------+
-| String - token_count      | Ignored |
-+---------------------------+---------+
-| String - constant_keyword | Ignored |
-+---------------------------+---------+
-| Autocomplete              | Ignored |
-+---------------------------+---------+
-| Geoshape                  | Ignored |
-+---------------------------+---------+
-| Cartesian field types     | Ignored |
-+---------------------------+---------+
-| Rank field types          | Ignored |
-+---------------------------+---------+
-| Star-tree                 | Ignored |
-+---------------------------+---------+
-| derived                   | Ignored |
-+---------------------------+---------+
-| Percolator                | Ignored |
-+---------------------------+---------+
-
-Field Parameters
-================
-
-For a field to be queryable in PPL, the following index settings must be enabled:
-
-+------------------+--------------------------------------------------+--------------------------------------------------+
-| Setting          | Description                                      | Required For                                     |
-+==================+==================================================+==================================================+
-| _source: true    | Stores the original JSON document                | Required for fetch raw data.                     |
-+------------------+--------------------------------------------------+--------------------------------------------------+
-| index: true      | Enables field indexing                           | Required for filtering, search, and aggregations |
-+------------------+--------------------------------------------------+--------------------------------------------------+
-| doc_values: true | Enables columnar access for aggregations/sorting | Required for `stats`, `sort`                     |
-+------------------+--------------------------------------------------+--------------------------------------------------+
-
-
-Nested Field Behavior
-=====================
-
-* There are `limitations `_ regarding the nested levels and query types that needs improvement.
-
-Multi-value Field Behavior
-==========================
-
-OpenSearch does not natively support the ARRAY data type but does allow multi-value fields implicitly. The
-SQL/PPL plugin adheres strictly to the data type semantics defined in index mappings. When parsing OpenSearch
-responses, it expects data to match the declared type and does not account for data in array format. If the
-plugins.query.field_type_tolerance setting is enabled, the SQL/PPL plugin will handle array datasets by returning
-scalar data types, allowing basic queries (e.g., source = tbl | where condition). However, using multi-value
-fields in expressions or functions will result in exceptions. If this setting is disabled or absent, only the
-first element of an array is returned, preserving the default behavior.
-
-Unsupported Functionalities in Calcite Engine
-=============================================
-
-Since 3.0.0, we introduce Apache Calcite as an experimental query engine. Please see `introduce v3 engine <../../../dev/intro-v3-engine.md>`_.
-For the following functionalities, the query will be forwarded to the V2 query engine. It means following functionalities cannot work with new PPL commands/functions introduced in 3.0.0 and above.
-
-* All SQL queries
-
-* PPL Queries against non-OpenSearch data sources
-
-* ``dedup`` with ``consecutive=true``
-
-* Search relevant commands
-
-  * AD
-  * ML
-  * Kmeans
-
-* ``show datasources`` and command
-
-* Commands with ``fetch_size`` parameter
-
-Malformed Field Names in Object Fields
-======================================
-
-OpenSearch normally rejects field names containing problematic dot patterns (such as ``.``, ``..``, ``.a``, ``a.``, or ``a..b``). However, when an object field has ``enabled: false``, OpenSearch bypasses field name validation and allows storing documents with any field names.
-
-If a document contains malformed field names inside an object field, PPL ignores those malformed field names. Other valid fields in the document are returned normally.
-
-**Example of affected data:**
-
-.. code-block:: json
-
-    {
-      "log": {
-        ".": "value1",
-        ".a": "value2",
-        "a.": "value3",
-        "a..b": "value4"
-      }
-    }
-
-When ``log`` is an object field with ``enabled: false``, subfields with malformed names are ignored.
-
-**Recommendation:** Avoid using field names that contain leading dots, trailing dots, consecutive dots, or consist only of dots. This aligns with OpenSearch's default field naming requirements.
diff --git a/docs/user/ppl/reference/splunk_to_ppl_cheat_sheet.md b/docs/user/ppl/reference/splunk_to_ppl_cheat_sheet.md
index 25d726f8be4..9111141078a 100644
--- a/docs/user/ppl/reference/splunk_to_ppl_cheat_sheet.md
+++ b/docs/user/ppl/reference/splunk_to_ppl_cheat_sheet.md
@@ -1,73 +1,73 @@
-# Splunk to OpenSearch PPL Cheat Sheet 
+# Splunk to OpenSearch PPL Cheat Sheet
 
 This cheat sheet helps Splunk users transition to OpenSearch's PPL. It maps common Splunk Search Processing Language (SPL) commands to their PPL equivalents with examples.
 
-## Structure and Concepts - +## Structure and Concepts + | Aspect | Splunk SPL | OpenSearch PPL | Notes | |--------|------------|---------------|-------| | Query structure | `search terms \| command` | `search term source = index \| command` | PPL requires explicit source at the beginning | -| Index reference | `index=name*` | `source=name*` | Different command to specify data source, [PPL support refering to multiple indices](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/general/identifiers.rst#multiple-indices)| +| Index reference | `index=name*` | `source=name*` | Different command to specify data source, [PPL support refering to multiple indices](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/general/identifiers.md#multiple-indices)| | Raw field | Special `_raw` field | Identify a field in your OpenSearch data that contains the text content you want to work with (often `message` or `content` fields in log data) | default field configured by the index.query.default_field setting (defaults to * which searches all fields) | | Time field | Special `_time` field | User-specified timestamp field | PPL use @timestamp by default | + - -## Command Reference +## Command Reference This table provides a mapping between Splunk SPL commands and their OpenSearch PPL equivalents: - + | Splunk SPL | OpenSearch PPL | Purpose | |------------|---------------|---------| -| append | [append](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/append.rst) | Append results from subsearch | -| appendcols | [appendcols](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/appendcol.rst) | Append columns from subsearch | -| bin | [bin](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/bin.rst) | Group numeric values into bins | -| bucket | [bin](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/bin.rst) | Group numeric values into bins | -| dedup | [dedup](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/dedup.rst) | Remove duplicate results | -| eval | [eval](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/eval.rst) | Calculate and create new fields | -| eventstats | [eventstats](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/eventstats.rst) | Calculate statistics while preserving events | -| mvexpand | [expand](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/expand.rst) | Expand multi-value fields | -| fields | [fields](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/fields.rst) | Include or exclude fields | -| fillnull | [fillnull](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/fillnull.rst) | Replace null values | -| head | [head](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/head.rst) | Retrieve the first N results | -| join | [join](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/join.rst) | Combine results from multiple sources | -| lookup | [lookup](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/lookup.rst) | Enrich data with lookups | -| rare | [rare](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/rare.rst) | Find the least common values | -| regex | [regex](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/regex.rst) | Filter with regular expression pattern | -| rename | 
[rename](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/rename.rst) | Rename fields in results | -| reverse | [reverse](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/reverse.rst) | Reverse the order of search results | -| rex | [rex](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/rex.rst) | Extract with regular expression pattern | -| search | [search](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/search.rst) | Basic searching of data | -| sort | [sort](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/sort.rst) | Sort results by specified fields | -| spath | [spath](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/spath.rst) | Extracting fields from structured text data | -| stats | [stats](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/stats.rst) | Statistical aggregation of data | -| subsearch | [subsearch](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/subquery.rst) | Enrich main search | -| table | [table](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/table.rst) | Select specific fields to display | -| timechart | [timechart](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/timechart.rst) | Statistical aggregation of time-series data | -| top | [top](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/top.rst) | Find the most common values | -| trendline | [trendline](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/trendline.rst) | Calculate moving averages of fields | -| where | [where](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/where.rst) | Filter results based on conditions | - - -## Example Query Conversions +| append | [append](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/append.md) | Append results from subsearch | +| appendcols | [appendcols](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/appendcol.md) | Append columns from subsearch | +| bin | [bin](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/bin.md) | Group numeric values into bins | +| bucket | [bin](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/bin.md) | Group numeric values into bins | +| dedup | [dedup](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/dedup.md) | Remove duplicate results | +| eval | [eval](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/eval.md) | Calculate and create new fields | +| eventstats | [eventstats](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/eventstats.md) | Calculate statistics while preserving events | +| mvexpand | [expand](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/expand.md) | Expand multi-value fields | +| fields | [fields](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/fields.md) | Include or exclude fields | +| fillnull | [fillnull](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/fillnull.md) | Replace null values | +| head | [head](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/head.md) | Retrieve the first N results | +| join | [join](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/join.md) | Combine results from multiple sources | +| lookup | 
[lookup](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/lookup.md) | Enrich data with lookups | +| rare | [rare](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/rare.md) | Find the least common values | +| regex | [regex](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/regex.md) | Filter with regular expression pattern | +| rename | [rename](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/rename.md) | Rename fields in results | +| reverse | [reverse](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/reverse.md) | Reverse the order of search results | +| rex | [rex](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/rex.md) | Extract with regular expression pattern | +| search | [search](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/search.md) | Basic searching of data | +| sort | [sort](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/sort.md) | Sort results by specified fields | +| spath | [spath](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/spath.md) | Extracting fields from structured text data | +| stats | [stats](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/stats.md) | Statistical aggregation of data | +| subsearch | [subsearch](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/subquery.md) | Enrich main search | +| table | [table](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/table.md) | Select specific fields to display | +| timechart | [timechart](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/timechart.md) | Statistical aggregation of time-series data | +| top | [top](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/top.md) | Find the most common values | +| trendline | [trendline](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/trendline.md) | Calculate moving averages of fields | +| where | [where](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/cmd/where.md) | Filter results based on conditions | + + +## Example Query Conversions **Simple search:** -- Splunk: `error failed status=500` -- PPL: ```source=`*` error failed status=500``` +- Splunk: `error failed status=500` +- PPL: ```source=`*` error failed status=500``` **Aggregation:** -- Splunk: `... | stats count by host, status | sort -count` -- PPL: `... | stats count() by host, status | sort - count` +- Splunk: `... | stats count by host, status | sort -count` +- PPL: `... | stats count() by host, status | sort - count` **Time-based query:** -- Splunk: `... | timechart span=1h count by host` -- PPL: `... | timechart span=1h count by host` +- Splunk: `... | timechart span=1h count by host` +- PPL: `... | timechart span=1h count by host` **Complex calculation:** -- Splunk: `... | eval mb=bytes/1024/1024 | stats avg(mb) AS avg_mb by host | where avg_mb > 100` -- PPL: `... | eval mb=bytes/1024/1024 | stats avg(mb) as avg_mb by host | where avg_mb > 100` - -## Basic Search Syntax +- Splunk: `... | eval mb=bytes/1024/1024 | stats avg(mb) AS avg_mb by host | where avg_mb > 100` +- PPL: `... 
| eval mb=bytes/1024/1024 | stats avg(mb) as avg_mb by host | where avg_mb > 100` +## Basic Search Syntax + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Basic search | `error` | `error` | Same syntax | @@ -77,26 +77,26 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Multiple values | `field IN (404, 503)` | `field in (404, 503)` | Same syntax | | Field doesn't equal | `field!=404` | `field!=404` | Same syntax | | Wildcard search | `field=value*` | `field=value*` | Same syntax | - -## Field Selection and Manipulation - + +## Field Selection and Manipulation + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Select fields | `... \| fields field1, field2` | `... \| fields field1, field2` | Same syntax | | Exclude fields | `... \| fields - field3` | `... \| fields - field3` | Same syntax | | Rename fields | `... \| rename field1 AS new_name` | `... \| rename field1 as new_name` | Same syntax | | Calculate field | `... \| eval new_field=field1 + field2` | `... \| eval new_field = field1 + field2` | Same syntax | - -## Filtering - + +## Filtering + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Filter results | `... \| where field > 100` | `... \| where field > 100` | Same syntax | | Compound filter | `... \| where field1=200 OR field2=203` | `... \| where field1=200 or field2=203` | Same syntax | + - -## Aggregation - +## Aggregation + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Count | `... \| stats count` | `... \| stats count` | Same syntax | @@ -105,9 +105,9 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Distinct count | `... \| stats dc(field)` | `... \| stats dc(field)` | Same syntax | | Min/Max | `... \| stats min(field), max(field)` | `... \| stats min(field), max(field)` | Same syntax | | Percentiles | `... \| stats perc95(field)` | `... \| stats perc95(field)` | Same syntax | - -## Sorting and Limiting - + +## Sorting and Limiting + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Sort ascending | `... \| sort field` | `... \| sort field` | Same syntax | @@ -115,9 +115,9 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Sort multiple | `... \| sort field1, -field2` | `... \| sort field1, -field2` | Same syntax | | Limit results | `... \| head 10` | `... \| head 10` | Same syntax | | Get last results | `... \| tail 10` | `... \| tail 10` | Same syntax | - -## Rex vs Parse - + +## Rex vs Parse + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Basic extraction | `... \| rex field=address "(?\d+) (?.+)"` | `... \| rex address "(?\d+) (?.+)"` | Same syntax | @@ -125,24 +125,24 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Search and replace mode | `... \| rex field=address mode=sed "s/\d+//g"` | `... \| rex field=address mode=sed "s/\d+//g"` | Same syntax | | Field override | `... \| rex field=address "(?
.+)"` | `... \| rex address "(?
.+)"` | Same syntax | | Default field (_raw) | `... \| rex "(?\d+) (?.+)"` | Not supported | PPL does not support implicit _raw field and requires explicit field specification | - -## Time Functions - + +## Time Functions + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Relative time | `earliest=-1d latest=now()` | `earliest("-1d", @timestamp) and latest("now", @timestamp)` | PPL supports earliest() and latest() functions | | Time extraction | `... \| eval hour=strftime(now(), "%H")` | `... \| eval hour = strftime(now(), '%H')` | Same syntax | | Time bucket | `... \| bin _time span=5m \| stats count by _time` | `... \| stats count by span(@timestamp, 5m)` | PPL uses `span()` | - -## Dedup - + +## Dedup + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Deduplicate | `... \| dedup field1, field2` | `... \| dedup field1, field2` | Same syntax | | Deduplicate with count | `... \| dedup 2 field1` | `... \| dedup 2 field1` | Same syntax | - -## Lookup and Joins - + +## Lookup and Joins + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Basic lookup | `... \| lookup vendors product_id` | `... \| lookup vendors product_id` | Same syntax | @@ -154,25 +154,25 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Left join | `... \| join type=left vendors [search index=vendors]` | `... \| left join vendors` | Different syntax format | | Join with ON clause | `... \| join type=inner left=a right=b where a.id = b.id vendors` | `... \| inner join left=a right=b ON a.id = b.id vendors` | PPL uses "ON" instead of "where" | | Append columns | `... \| appendcols [search source=other_index \| fields id, status]` | `... \| appendcols [source=other_index \| fields id, status]` | Similar syntax | - -## Field Manipulation - + +## Field Manipulation + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Include fields | `... \| fields field1, field2` | `... \| fields field1, field2` | Same syntax | | Exclude fields | `... \| fields - field3` | `... \| fields - field3` | Same syntax | | Rename fields | `... \| rename field1 as new_name` | `... \| rename field1 as new_name` | PPL uses lowercase "as" | | Replace null values | `... \| fillnull value=0 field1, field2` | `... \| fillnull with 0 in field1, field2` | Similar syntax but different format | - -## Handling Null Values - + +## Handling Null Values + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Basic null replacement | `... \| fillnull value=0 field1` | `... \| fillnull with 0 in field1` | Similar syntax but uses `with...in` format | | Multiple fields | `... \| fillnull value="N/A" field1, field2, field3` | `... \| fillnull with 'N/A' in field1, field2, field3` | Similar syntax but uses `with...in` format | - -## Results Limiting - + +## Results Limiting + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | First N results | `... \| head 10` | `... \| head 10` | Same syntax | @@ -180,9 +180,9 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Moving average | `... \| trendline sma5(value)` | `... \| trendline sma5(value)` | Same syntax | | Top values | `... \| top 10 field` | `... \| top 10 field` | Same syntax | | Rare values | `... \| rare 10 field` | `... 
\| rare 10 field` | Same syntax | - -## String Functions - + +## String Functions + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | String concatenation | `... \| eval result=field1 + " " + field2` | `... \| eval result = concat(field1, ' ', field2)` | PPL requires `concat()` function | @@ -193,9 +193,9 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Replace | `... \| eval result=replace(field, "pattern", "replacement")` | `... \| eval result = replace(field, 'pattern', 'replacement')` | Same syntax | | Trim whitespace | `... \| eval result=trim(field)` | `... \| eval result = trim(field)` | Same syntax | | Contains (wildcard) | `... \| eval result=like(field, "%pattern%")` | `... \| eval result = like(field, '%pattern%')` | Same syntax | - -## Conditional Functions - + +## Conditional Functions + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | If condition | `... \| eval result=if(field > 100, "High", "Low")` | `... \| eval result = if(field > 100, 'High', 'Low')` | Same syntax | @@ -203,9 +203,9 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | NULL check | `... \| eval result=if(isnull(field), "Missing", field)` | `... \| eval result = if(isnull(field), 'Missing', field)` | Same syntax | | Empty check | `... \| eval result=if(isnotnull(field), field, "Default")` | `... \| eval result = if(isnotnull(field), field, 'Default')` | Same syntax | | Coalesce (first non-null) | `... \| eval result=coalesce(field1, field2, "default")` | `... \| eval result = coalesce(field1, field2, 'default')` | Same syntax | - -## Math Functions - + +## Math Functions + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Addition | `... \| eval sum=field1 + field2` | `... \| eval sum = field1 + field2` | Same syntax | @@ -219,9 +219,9 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Floor | `... \| eval result=floor(field)` | `... \| eval result = floor(field)` | Same syntax | | Power | `... \| eval result=pow(field, 2)` | `... \| eval result = pow(field, 2)` | Same syntax | | Square root | `... \| eval result=sqrt(field)` | `... \| eval result = sqrt(field)` | Same syntax | - -## Date and Time Functions - + +## Date and Time Functions + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | Current time | `... \| eval now=now()` | `... \| eval now = now()` | Same syntax | @@ -230,11 +230,12 @@ This table provides a mapping between Splunk SPL commands and their OpenSearch P | Day ago | `... \| eval yesterday=relative_time(now(), "-1d")` | `... \| eval yesterday = date_sub(now(), INTERVAL 1 DAY)` | PPL uses interval syntax | | Day ahead | `... \| eval tomorrow=relative_time(now(), "+1d")` | `... \| eval tomorrow = date_add(now(), INTERVAL 1 DAY)` | PPL uses interval syntax | | Time difference | `... \| eval diff=(_time2 - _time1)` | `... \| eval diff = date_diff('second', timestamp1, timestamp2)` | PPL uses function | - -## Other Functions - + +## Other Functions + | Operation | Splunk SPL | OpenSearch PPL | Notes | |-----------|------------|---------------|-------| | MD5 hash | Not native | `... \| eval hash = md5('string')` | PPL-specific feature | | SHA1 hash | Not native | `... \| eval hash = sha1('string')` | PPL-specific feature | | JSON extraction | `... 
\| spath input=data path=user.name output=username` | `... \| eval username = json_extract(data, '$.user.name')` | Different approach | + \ No newline at end of file diff --git a/doctest/markdown_parser.py b/doctest/markdown_parser.py new file mode 100644 index 00000000000..00ea5441d49 --- /dev/null +++ b/doctest/markdown_parser.py @@ -0,0 +1,286 @@ +""" +Markdown-based doctest parser for clean copy-paste documentation. + +Parses Markdown code fences instead of RST directives. +""" + +import inspect +import re +from pathlib import Path +from typing import Callable, List, Optional, Tuple, Union +import doctest + + +class MarkdownDocTestParser: + """ + Parses Markdown files looking for paired code blocks: + 1. Input code block (sql, ppl, sh, bash, bash ppl) + 2. Output code block (text, console, json, yaml, or output) + + Example Markdown format: + + ```sql + SELECT * FROM accounts + ``` + + ```text + +------+ + | name | + +------+ + | John | + +------+ + ``` + """ + + # Regex to match Markdown code fences with optional attributes + CODE_FENCE_PATTERN = re.compile( + r'^```(\w+)([^\n]*?)\s*\n' # ```language [attributes] (no newlines in attributes) + r'(.*?)' # code content (non-greedy) + r'^```\s*$', # closing ``` + re.MULTILINE | re.DOTALL + ) + + def __init__(self, input_languages: Optional[List[str]] = None, + output_languages: Optional[List[str]] = None, + transform: Optional[Callable] = None) -> None: + """ + Args: + input_languages: List of languages for input blocks (e.g., ['sql', 'ppl']) + output_languages: List of languages for output blocks (e.g., ['text', 'console']) + transform: Function to transform input code before execution + """ + self.input_languages = input_languages or ['sql', 'ppl', 'bash', 'sh', 'bash ppl'] + self.output_languages = output_languages or ['text', 'console', 'output', 'json', 'yaml'] + self.transform = transform or (lambda x: x) + + def parse(self, text: str, name: str = '') -> doctest.DocTest: + """ + Parse Markdown text and extract test cases from code fence pairs. + + Returns a DocTest object compatible with doctest.DocTestRunner. + """ + examples = [] + blocks = self._extract_code_blocks(text) + + # Find pairs of input/output blocks + i = 0 + while i < len(blocks) - 1: + lang1, code1, lineno1 = blocks[i] + lang2, code2, lineno2 = blocks[i + 1] + + # Check if this is an input/output pair + if lang1 in self.input_languages and lang2 in self.output_languages: + # Create a doctest example + source = code1.rstrip('\n') + want = code2.rstrip('\n') + '\n' # doctest expects trailing newline + + # Apply transform to source + if callable(self.transform): + # Check if transform accepts language parameter + sig = inspect.signature(self.transform) + if len(sig.parameters) > 1: + transformed_source = self.transform(source, lang1) + else: + transformed_source = self.transform(source) + else: + transformed_source = source + + example = doctest.Example( + source=transformed_source, + want=want, + lineno=lineno1, + indent=0, + options={} + ) + examples.append(example) + + # Skip the output block since we've paired it + i += 2 + else: + # Not a pair, move to next block + i += 1 + + return doctest.DocTest( + examples=examples, + globs={}, + name=name, + filename=name, + lineno=0, + docstring=text + ) + + def get_doctest(self, docstring: str, globs: dict, name: str, filename: str, lineno: int) -> doctest.DocTest: + """ + Extract a DocTest object from the given docstring. + This method is required for compatibility with DocFileSuite. 
+ """ + # Read the file content + content = Path(filename).read_text(encoding='utf-8') + + # Parse the markdown content and update globs + doctest_obj = self.parse(content, name=filename) + doctest_obj.globs.update(globs) + return doctest_obj + + def _extract_code_blocks(self, text: str) -> List[Tuple[str, str, int]]: + """ + Extract all code blocks from Markdown text, skipping those with 'ignore' attribute. + + Returns list of (language, code, line_number) tuples. + """ + blocks = [] + for match in self.CODE_FENCE_PATTERN.finditer(text): + language = match.group(1).lower() + attributes = match.group(2) or "" + code = match.group(3) + lineno = text[:match.start()].count('\n') + 1 + + # Skip blocks with 'ignore' attribute + if "ignore" in attributes: + continue + + blocks.append((language, code, lineno)) + + return blocks + + +def create_markdown_suite(filepath: Union[str, Path], transform: Optional[Callable] = None, + setup: Optional[Callable] = None, globs: Optional[dict] = None) -> doctest.DocTestSuite: + """ + Create a test suite from a Markdown file. + + Args: + filepath: Path to Markdown file + transform: Function to transform input code + setup: Setup function to run before tests + globs: Global variables for test execution + + Returns: + doctest.DocTestSuite + """ + parser = MarkdownDocTestParser(transform=transform) + + content = Path(filepath).read_text(encoding='utf-8') + + doctest_obj = parser.parse(content, name=str(filepath)) + + # Set up globs if provided + if globs: + doctest_obj.globs.update(globs) + + # Create a test case + test = doctest.DocTestCase( + doctest_obj, + optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS, + setUp=setup + ) + + return doctest.DocTestSuite(test_finder=lambda: [doctest_obj]) + + +# Transform functions for different languages +def sql_markdown_transform(code: str, lang: str = "sql") -> str: + """Transform SQL code for execution.""" + return f'sql_cmd.process({repr(code.strip().rstrip(";"))})' + + +def ppl_markdown_transform(code: str, lang: str = "ppl") -> str: + """Transform PPL code for execution.""" + # Join multi-line PPL queries into a single line + # Remove leading/trailing whitespace and join lines with space + single_line = " ".join( + line.strip() for line in code.strip().split("\n") if line.strip() + ) + return f'ppl_cmd.process({repr(single_line.rstrip(";"))})' + + +def bash_markdown_transform(code: str, lang: str = "bash") -> str: + """Transform bash code for execution.""" + if code.strip().startswith("opensearchsql"): + match = re.search(r'opensearchsql\s+-q\s+"(.*?)"', code) + if match: + query = match.group(1) + return f'cmd.process({repr(query.strip().rstrip(";"))})' + return f'pretty_print(sh("""{code}""").stdout.decode("utf-8"))' + + +def bash_ppl_markdown_transform(code: str, lang: str = "bash ppl") -> str: + """Transform bash ppl code for execution (curl commands with PPL queries).""" + return f'pretty_print(sh("""{code}""").stdout.decode("utf-8"))' + + +def mixed_ppl_transform(code: str, lang: str = "ppl") -> str: + """Mixed transform that handles both ppl and bash ppl.""" + if lang == "bash ppl" or "curl" in code.lower(): + return bash_ppl_markdown_transform(code, lang) + else: + return ppl_markdown_transform(code, lang) + + +def detect_markdown_format(filepath: Union[str, Path]) -> bool: + """ + Check if a file uses Markdown code fences. 
+ + Returns: + True if file uses ```language``` code fences + False otherwise + """ + content = Path(filepath).read_text(encoding='utf-8') + + # Check for Markdown code fences + return bool(re.search(r'^```\w+\s*\n', content, re.MULTILINE)) + + +def create_hybrid_markdown_suite(filepaths: List[Union[str, Path]], doc_type: str, + setup_func: Optional[Callable] = None) -> doctest.DocTestSuite: + """ + Create test suite for Markdown files. + + Args: + filepaths: List of Markdown file paths + doc_type: 'sql', 'ppl', or 'bash' + setup_func: Setup function to initialize test environment + + Returns: + doctest.DocTestSuite + """ + # Choose transform based on doc type + if 'sql' in doc_type: + transform = sql_markdown_transform + input_langs = ['sql'] + elif 'ppl' in doc_type: + transform = ppl_markdown_transform + input_langs = ['ppl'] + else: # bash + transform = bash_markdown_transform + input_langs = ['bash', 'sh'] + + parser = MarkdownDocTestParser( + input_languages=input_langs, + output_languages=['text', 'console', 'output'], + transform=transform + ) + + all_tests = [] + + for filepath in filepaths: + content = Path(filepath).read_text(encoding='utf-8') + + doctest_obj = parser.parse(content, name=str(filepath)) + + # Only add if there are examples + if doctest_obj.examples: + all_tests.append(doctest_obj) + + # Create test suite + def setUp(test): + if setup_func: + setup_func(test) + + suite = doctest.DocTestSuite( + test_finder=lambda: all_tests, + setUp=setUp, + optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS + ) + + return suite diff --git a/doctest/test_docs.py b/doctest/test_docs.py index d3cea5782b5..e57c41d6827 100644 --- a/doctest/test_docs.py +++ b/doctest/test_docs.py @@ -13,6 +13,7 @@ import unittest from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial +from typing import List import click import zc.customdoctests @@ -20,6 +21,15 @@ from opensearch_sql_cli.opensearch_connection import OpenSearchConnection from opensearch_sql_cli.utils import OutputSettings from opensearchpy import OpenSearch, helpers +from markdown_parser import mixed_ppl_transform + + +# Import Markdown parser +from markdown_parser import ( + MarkdownDocTestParser, + ppl_markdown_transform, + sql_markdown_transform, +) ENDPOINT = "http://localhost:9200" @@ -100,11 +110,11 @@ def requires_calcite(doc_category): class CategoryManager: - + def __init__(self, category_file_path='../docs/category.json'): self._categories = self.load_categories(category_file_path) self._all_docs_cache = None - + def load_categories(self, file_path): try: with open(file_path) as json_file: @@ -117,48 +127,87 @@ def load_categories(self, file_path): for category, docs in categories.items() } debug(f"Loaded {len(categories)} categories from {file_path}") + + # Validate markdown-only categories + for category_name in categories.keys(): + self._validate_category_files(category_name, categories[category_name]) + return categories except Exception as e: raise Exception(f"Failed to load categories from {file_path}: {e}") - + + def _validate_category_files(self, category_name, docs): + """Internal method to validate category files during loading.""" + if self.is_markdown_category(category_name): + # Markdown-only categories should not contain .rst files + rst_files = [doc for doc in docs if doc.endswith(".rst")] + if rst_files: + raise Exception( + f"Only markdown files supported for category: {category_name}" + ) + debug( + f"Category {category_name} validation passed - all files are 
markdown" + ) + else: + # Non-markdown categories should only contain .rst files + md_files = [doc for doc in docs if doc.endswith(".md")] + if md_files: + raise Exception( + f"Only .rst files supported for category: {category_name}. Markdown not yet supported." + ) + debug(f"Category {category_name} validation passed - all files are .rst") + def get_all_categories(self): return list(self._categories.keys()) - + def get_category_files(self, category_name): return self._categories.get(category_name, []) - + def get_all_docs(self): if self._all_docs_cache is None: self._all_docs_cache = [] for category_name, docs in self._categories.items(): self._all_docs_cache.extend(docs) return self._all_docs_cache - + def find_file_category(self, file_path): # Convert to relative path from docs root if file_path.startswith('../docs/'): rel_path = file_path[8:] # Remove '../docs/' prefix else: rel_path = file_path - + for category_name, docs in self._categories.items(): if rel_path in docs: debug(f"Found file {rel_path} in category {category_name}") return category_name - + # Fallback to path-based detection debug(f"File {rel_path} not found in categories, using path-based detection") return detect_doc_type_from_path(file_path) - + def requires_calcite(self, category_name): - return category_name.endswith('_calcite') - + return category_name.endswith("_calcite") + + def is_markdown_category(self, category_name): + """Check if category uses Markdown files.""" + return category_name in ("ppl_cli_calcite", "bash_calcite", "bash_settings") + + def validate_category_files(self, category_name): + """Validate that categories contain only the correct file types. + + Markdown categories should only contain .md files. + Non-markdown categories should only contain .rst files. 
+ """ + docs = self.get_category_files(category_name) + self._validate_category_files(category_name, docs) + def get_setup_function(self, category_name): if self.requires_calcite(category_name): return set_up_test_indices_with_calcite else: return set_up_test_indices_without_calcite - + def get_parser_for_category(self, category_name): if category_name.startswith('bash'): return bash_parser @@ -169,11 +218,21 @@ def get_parser_for_category(self, category_name): else: # Default fallback return sql_cli_parser - + def find_matching_files(self, search_filename): - if not search_filename.endswith('.rst'): - search_filename += '.rst' - + # Support both .rst and .md extensions + if not search_filename.endswith(".rst") and not search_filename.endswith(".md"): + # Try both extensions + all_docs = self.get_all_docs() + matches = [ + doc + for doc in all_docs + if doc.endswith(search_filename + ".rst") + or doc.endswith(search_filename + ".md") + or doc.endswith(search_filename) + ] + return matches + all_docs = self.get_all_docs() matches = [doc for doc in all_docs if doc.endswith(search_filename)] return matches @@ -189,22 +248,33 @@ def __init__(self, query_language="sql", endpoint=ENDPOINT): def process(self, statement): debug(f"Executing {self.query_language.upper()} query: {statement}") - - data = self.execute_query(statement, use_console=False) - debug(f"Query result: {data}") - - if data is None: - debug("Query returned None - this may indicate an error or unsupported function") - print("Error: Query returned no data") - return - - output = self.formatter.format_output(data) - output = "\n".join(output) - click.echo(output) + + try: + data = self.execute_query(statement, use_console=False) + debug(f"Query result: {data}") + + if data is None: + debug( + "Query returned None - this may indicate an error or unsupported function" + ) + print("Error: Query returned no data") + return + + output = self.formatter.format_output(data) + output = "\n".join(output) + click.echo(output) + except Exception as e: + # Print detailed error information + print(f"Error executing query: {statement}") + print(f"Error type: {type(e).__name__}") + print(f"Error message: {str(e)}") + if hasattr(e, "info"): + print(f"Error info: {e.info}") + raise class CalciteManager: - + @staticmethod def set_enabled(enabled): import requests @@ -216,16 +286,17 @@ def set_enabled(enabled): response = requests.put(f"{ENDPOINT}/_plugins/_query/settings", json=calcite_settings, timeout=10) - + if response.status_code != 200: raise Exception(f"Failed to set Calcite setting: {response.status_code} {response.text}") -class TestDataManager: - + +class DataManager: + def __init__(self): - self.client = OpenSearch([ENDPOINT], verify_certs=True) + self.client = OpenSearch([ENDPOINT], verify_certs=True, timeout=60) self.is_loaded = False - + def load_file(self, filename, index_name): mapping_file_path = './test_mapping/' + filename if os.path.isfile(mapping_file_path): @@ -298,7 +369,7 @@ def bash_transform(s): def get_test_data_manager(): global test_data_manager if test_data_manager is None: - test_data_manager = TestDataManager() + test_data_manager = DataManager() return test_data_manager @@ -365,12 +436,70 @@ def create_cli_suite(filepaths, parser, setup_func): setUp=setup_func ) + +def create_markdown_suite(filepaths, category_name, setup_func): + """ + Create test suite for Markdown files. 
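+
+    Each fenced input block (```ppl, ```sql, or ```bash, depending on the
+    category) is parsed into a doctest example, and the fenced output block
+    that follows it (```text, ```console, etc.) supplies the expected output;
+    the exact language lists are configured on MarkdownDocTestParser below.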
+ + Args: + filepaths: List of Markdown file paths + category_name: Category name (e.g., 'ppl_cli_calcite') + setup_func: Setup function to initialize test environment + + Returns: + doctest.DocTestSuite + """ + + # Determine transform based on category + if "sql" in category_name: + transform = sql_markdown_transform + input_langs = ["sql"] + elif "ppl" in category_name: + transform = mixed_ppl_transform + input_langs = ["ppl", "bash ppl"] + elif "bash" in category_name: + transform = mixed_ppl_transform + input_langs = ["bash", "bash ppl", "sh"] + else: + # Default to PPL + transform = mixed_ppl_transform + input_langs = ["ppl", "sql", "bash ppl"] + + parser = MarkdownDocTestParser( + input_languages=input_langs, + output_languages=["text", "console", "output", "json", "yaml"], + transform=transform, + ) + + # Prepare globs for bash commands + test_globs = {} + if "bash" in category_name: + test_globs = { + "sh": partial( + subprocess.run, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=60, + shell=True, + ), + "pretty_print": pretty_print, + } + + return docsuite( + *filepaths, + parser=parser, + setUp=setup_func, + globs=test_globs, + ) + + # Entry point for unittest discovery def load_tests(loader, suite, ignore): tests = [] settings_tests = [] category_manager = CategoryManager() - + for category_name in category_manager.get_all_categories(): docs = category_manager.get_category_files(category_name) if not docs: @@ -381,7 +510,6 @@ def load_tests(loader, suite, ignore): settings_tests.append(suite) else: tests.append(suite) - random.shuffle(tests) if settings_tests: random.shuffle(settings_tests) @@ -391,7 +519,9 @@ def load_tests(loader, suite, ignore): def get_test_suite(category_manager: CategoryManager, category_name, filepaths): setup_func = category_manager.get_setup_function(category_name) - if category_name.startswith('bash'): + if category_manager.is_markdown_category(category_name): + return create_markdown_suite(list(filepaths), category_name, setup_func) + elif category_name.startswith("bash"): return create_bash_suite(filepaths, setup_func) else: parser = category_manager.get_parser_for_category(category_name) @@ -399,21 +529,21 @@ def get_test_suite(category_manager: CategoryManager, category_name, filepaths): def list_available_docs(category_manager: CategoryManager): categories = category_manager.get_all_categories() - + print(f"Available documentation files for testing:\n") - + total = 0 - for category_name in categories.items(): + for category_name in categories: files = category_manager.get_category_files(category_name) total += len(files) print(f"{category_name} docs ({len(files)} files):\n") for doc in sorted(files): print(f" ../docs/{doc}\n") - + print(f"Total: {total} documentation files available for testing\n") -def resolve_files(category_manager: CategoryManager, file_paths: list[str]): +def resolve_files(category_manager: CategoryManager, file_paths: List[str]): result = [] for file_param in file_paths: resolved_files = category_manager.find_matching_files(file_param) @@ -444,7 +574,7 @@ def main(): - If a filename matches multiple files, all matches will be executed """ ) - + parser.add_argument('file_paths', nargs='*', help='Path(s) to the documentation file(s) to test') parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output with detailed diff information') @@ -452,14 +582,14 @@ def main(): help='Custom OpenSearch endpoint (default: http://localhost:9200)') 
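 
     # Illustrative invocations (hypothetical doc names; matching is by filename suffix):
     #   python test_docs.py stats                        # runs stats.rst and/or stats.md
     #   python test_docs.py ppl/cmd/stats.md -e http://localhost:9200
 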
parser.add_argument('--list', '-l', action='store_true',
                         help='List all available documentation files')
-
+
     args = parser.parse_args()
     category_manager = CategoryManager()
     if args.list:
         list_available_docs(category_manager)
         return
-
+
     if not args.file_paths:
         print("No specific files provided. Running full doctest suite...")
         unittest.main(module=None, argv=['test_docs.py'], exit=False)
@@ -467,8 +597,8 @@
 
     if args.endpoint:
         global ENDPOINT
-        ENDPOINT = endpoint
-        print(f"Using custom endpoint: {endpoint}")
+        ENDPOINT = args.endpoint
+        print(f"Using custom endpoint: {args.endpoint}")
 
     all_files_to_test = resolve_files(category_manager, args.file_paths)
@@ -483,5 +613,6 @@
     sys.exit(0 if all_success else 1)
 
+
 if __name__ == '__main__':
     main()
diff --git a/scripts/docs_exporter/convert_rst_to_md.py b/scripts/docs_exporter/convert_rst_to_md.py
new file mode 100644
index 00000000000..2a27b179fe6
--- /dev/null
+++ b/scripts/docs_exporter/convert_rst_to_md.py
@@ -0,0 +1,536 @@
+#!/usr/bin/env python3
+"""
+Convert RST PPL documentation to Markdown format.
+
+This script converts RST files with os> prompts to Markdown with clean code fences.
+"""
+
+import re
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+
+def convert_rst_table_to_markdown(table_lines: List[str]) -> Optional[str]:
+    """Convert RST grid table to Markdown table."""
+    # Extract rows (lines starting with |)
+    rows = [line for line in table_lines if line.strip().startswith('|')]
+
+    if len(rows) < 2:
+        return None
+
+    # Parse cells from each row
+    parsed_rows = []
+    for row in rows:
+        # Split by | and clean up
+        cells = [cell.strip() for cell in row.split('|')[1:-1]]
+        parsed_rows.append(cells)
+
+    # Build markdown table
+    md_table = []
+
+    # Header row
+    md_table.append('| ' + ' | '.join(parsed_rows[0]) + ' |')
+
+    # Separator row
+    md_table.append('| ' + ' | '.join(['---'] * len(parsed_rows[0])) + ' |')
+
+    # Data rows
+    for row in parsed_rows[1:]:
+        md_table.append('| ' + ' | '.join(row) + ' |')
+
+    return '\n'.join(md_table)
+
+
+def convert_inline_code(text: str) -> str:
+    """Convert RST inline code ``code`` to Markdown `code`."""
+    # Special case: ```*``` (three backticks) renders as `*` with backticks visible
+    # Convert ```*``` to `` `*` `` in Markdown
+    text = re.sub(r'```([^`]+)```', r'`` `\1` ``', text)
+
+    # Convert regular ``code`` to `code`
+    text = re.sub(r'``([^`]+)``', r'`\1`', text)
+    return text
+
+
+def convert_links(text: str) -> str:
+    """Convert RST links to Markdown links."""
+    # Convert `link text <url>`_ to [link text](url)
+    # Also convert .rst to .md for internal links
+    def replace_link(match):
+        link_text = match.group(1).strip()
+        url = match.group(2)
+        # Convert .rst to .md for all links (including GitHub URLs and anchors)
+        url = re.sub(r'\.rst(#|$)', r'.md\1', url)
+        return f'[{link_text}]({url})'
+
+    # More specific regex: match backtick, non-greedy text, space, <url>, backtick, underscore
+    text = re.sub(r'`([^`<]+?)\s*<([^>]+)>`_', replace_link, text)
+
+    # Convert section references `Section Name`_ to [Section Name](#section-name)
+    # These are internal anchor links to headings in the same document
+    def replace_section_ref(match):
+        section_name = match.group(1)
+        # Convert to lowercase and replace spaces with hyphens for anchor
+        # Keep underscores as they are (don't convert to hyphens)
+        anchor = section_name.lower().replace(' ', '-')
+        # Remove special characters that aren't valid in anchors (but keep underscores and hyphens)
+        anchor = 
re.sub(r'[^\w\-]', '', anchor) + return f'[{section_name}](#{anchor})' + + text = re.sub(r'`([^`<]+)`_', replace_section_ref, text) + + return text + + +def convert_heading(line: str, next_line: Optional[str], heading_map: Dict[str, int]) -> Optional[str]: + """Convert RST heading to Markdown. + + Args: + line: The heading text + next_line: The underline + heading_map: Dict mapping underline chars to heading levels + + Returns: + Markdown heading string or None + """ + if not next_line: + return None + + # Detect underline character + underline_char = None + if re.match(r'^=+$', next_line): + underline_char = '=' + elif re.match(r'^-+$', next_line): + underline_char = '-' + elif re.match(r'^\^+$', next_line): + underline_char = '^' + elif re.match(r'^~+$', next_line): + underline_char = '~' + elif re.match(r'^>+$', next_line): + underline_char = '>' + elif re.match(r'^`+$', next_line): + underline_char = '`' + + if underline_char: + # Assign level based on first appearance (after title) + if underline_char not in heading_map: + # Start at H2 (H1 is reserved for title with overline) + heading_map[underline_char] = len(heading_map) + 2 + + level = heading_map[underline_char] + prefix = '#' * level + return f"{prefix} {line}\n" + + return None + + +def parse_os_block(block_content: str) -> Tuple[str, str]: + """Parse a block with os>, PPL>, or > prompts into command and output.""" + lines = block_content.strip().split('\n') + + command_lines = [] + output_lines = [] + in_output = False + + for line in lines: + # Handle os>, PPL>, OS>, and > prompts (case-insensitive for os/ppl) + line_lower = line.lower() + if line_lower.startswith('os> ') or line_lower.startswith('ppl> ') or line.startswith('> '): + # Command line - remove prompt prefix and ; suffix + if line_lower.startswith('os> '): + cmd = line[4:].rstrip(';').strip() + elif line_lower.startswith('ppl> '): + cmd = line[5:].rstrip(';').strip() + else: # > + cmd = line[2:].rstrip(';').strip() + command_lines.append(cmd) + elif line.strip() == '': + # Blank line separates command from output + if command_lines and not in_output: + in_output = True + else: + # Output line + in_output = True + output_lines.append(line) + + # Format command with pipes on separate lines + if command_lines: + command = command_lines[0] + # Split on | and format nicely + if '|' in command: + parts = [p.strip() for p in command.split('|')] + command = parts[0] + '\n' + '\n'.join(f'| {p}' for p in parts[1:]) + else: + command = '' + + return command, '\n'.join(output_lines) + + +def split_multiple_queries(block_content: str) -> List[str]: + """Split a block with multiple os>/PPL>/OS> queries into separate blocks.""" + lines = block_content.strip().split('\n') + blocks = [] + current_block = [] + + for line in lines: + # Check if this is a new query prompt (case-insensitive) + line_lower = line.lower() + if (line_lower.startswith('os> ') or line_lower.startswith('ppl> ') or line.startswith('> ')) and current_block: + # Save the previous block + blocks.append('\n'.join(current_block)) + current_block = [line] + else: + current_block.append(line) + + # Don't forget the last block + if current_block: + blocks.append('\n'.join(current_block)) + + return blocks if len(blocks) > 1 else [block_content] + + +def convert_code_block_to_markdown(block_content: str) -> Optional[str]: + """Convert RST code block to Markdown code fences.""" + # Check if there are multiple queries in this block + query_blocks = split_multiple_queries(block_content) + + md_parts = [] + for 
query_block in query_blocks: + command, output = parse_os_block(query_block) + + if not command: + continue + + # Create Markdown code fences + md = f"```ppl\n{command}\n```\n" + + if output: + md += f"\nExpected output:\n\n```text\n{output}\n```" + + md_parts.append(md) + + return '\n\n'.join(md_parts) if md_parts else None + + +def convert_rst_to_markdown(rst_content: str) -> str: + """Convert RST content to Markdown.""" + lines = rst_content.split('\n') + md_lines = [] + i = 0 + skip_next = False + heading_map: Dict[str, int] = {} # Track underline char to heading level mapping + + while i < len(lines): + if skip_next: + skip_next = False + i += 1 + continue + + line = lines[i] + next_line = lines[i + 1] if i + 1 < len(lines) else None + prev_line = lines[i - 1] if i > 0 else None + + # Check for title with overline (e.g., ====\nTitle\n====) + if (prev_line and next_line and + re.match(r'^=+$', prev_line.strip()) and + re.match(r'^=+$', next_line.strip()) and + len(prev_line.strip()) == len(next_line.strip())): + # This is a title (H1) with overline + md_lines.append(f"# {line}\n") + skip_next = True + i += 1 + continue + + # Skip RST artifacts (standalone underlines) + if line.strip() and re.match(r'^[=\-\^~]+$', line.strip()): + i += 1 + continue + + # Check for headings (underline only) + heading = convert_heading(line, next_line, heading_map) + if heading: + md_lines.append(heading) + skip_next = True + i += 1 + continue + + # Check for RST grid tables (lines starting with +---+) + if line.strip().startswith('+') and '-' in line and '+' in line: + # Collect the entire table + table_lines = [] + j = i + while j < len(lines) and (lines[j].strip().startswith('+') or lines[j].strip().startswith('|')): + table_lines.append(lines[j]) + j += 1 + + if table_lines: + # Convert RST table to markdown + md_table = convert_rst_table_to_markdown(table_lines) + if md_table: + md_lines.append(md_table + '\n') + i = j + continue + + # Check for list-table directive + if line.strip().startswith('.. list-table::'): + # Extract table caption if present + caption = line.strip()[15:].strip() + i += 1 + + # Skip options (like :widths:, :header-rows:) + while i < len(lines) and lines[i].strip().startswith(':'): + i += 1 + + # Skip blank line + if i < len(lines) and lines[i].strip() == '': + i += 1 + + # Collect table rows (lines starting with * -) + table_rows = [] + current_row = [] + + while i < len(lines): + line_content = lines[i] + + # New row starts with * - + if line_content.strip().startswith('* -'): + if current_row: + table_rows.append(current_row) + current_row = [line_content.strip()[3:].strip()] + i += 1 + # Continuation of cell (starts with - or indented) + elif line_content.strip().startswith('- ') and current_row: + current_row.append(line_content.strip()[2:].strip()) + i += 1 + # End of table + elif line_content.strip() == '' or not (line_content.startswith(' ') or line_content.strip().startswith('-')): + if current_row: + table_rows.append(current_row) + break + else: + i += 1 + + # Convert to markdown table + if table_rows: + if caption: + md_lines.append(f"{caption}\n\n") + + # Header row + md_lines.append('| ' + ' | '.join(table_rows[0]) + ' |') + md_lines.append('| ' + ' | '.join(['---'] * len(table_rows[0])) + ' |') + + # Data rows + for row in table_rows[1:]: + md_lines.append('| ' + ' | '.join(row) + ' |') + + md_lines.append('\n') + continue + + # Check for image directive + if line.strip().startswith('.. 
image::'): + image_url = line.strip()[10:].strip() + # Use the URL as alt text (can be improved if there's a :alt: option) + md_lines.append(f'![Image]({image_url})\n') + i += 1 + # Skip any image options (like :alt:, :width:, etc.) + while i < len(lines) and lines[i].strip().startswith(':'): + i += 1 + continue + + # Check for other RST directives to skip + if line.strip().startswith('..'): + # Skip directive and its options + while i < len(lines) and (lines[i].strip().startswith('..') or + lines[i].strip().startswith(':') or + lines[i].strip() == ''): + i += 1 + continue + + # Remove pipe prefix from description lines + if line.startswith('| '): + line = line[2:] + + # Convert links + line = convert_links(line) + + # Convert inline code + line = convert_inline_code(line) + + # Detect subsections (lines that look like subsection titles before code blocks) + if (i + 1 < len(lines) and + not line.startswith('#') and + line.strip() and + not line.strip().startswith('*') and + not line.strip().startswith('-') and + lines[i + 1].strip() and + not lines[i + 1].startswith(' ') and + len(line) < 80 and + (i + 2 < len(lines) and + (lines[i + 2].strip().startswith('The ') or + lines[i + 2].strip().startswith('This ')))): + # This looks like a subsection title + md_lines.append(f"### {line}\n") + i += 1 + continue + + # Check for RST directives + if line.strip().startswith('.. code-block::'): + # Extract language if present + match = re.match(r'\s*\.\. code-block::\s*(\w+)?', line) + lang = match.group(1) if match and match.group(1) else 'text' + + # Look ahead for indented block + j = i + 1 + # Skip blank line after directive + if j < len(lines) and lines[j].strip() == '': + j += 1 + + block_lines = [] + while j < len(lines) and (lines[j].startswith(' ') or lines[j].strip() == ''): + if lines[j].startswith(' '): + block_lines.append(lines[j][3:]) + else: + block_lines.append(lines[j]) + j += 1 + + if block_lines: + md_lines.append(f'```{lang}') + md_lines.extend(block_lines) + md_lines.append('```\n') + i = j + continue + + # Check for .. list-table:: directive + if line.strip().startswith('.. list-table::'): + # Skip the directive - tables need manual conversion + md_lines.append('**Table:**\n') + i += 1 + # Skip options and blank lines + while i < len(lines) and (lines[i].strip().startswith(':') or lines[i].strip() == ''): + i += 1 + continue + + # Check for .. note:: directive + if line.strip().startswith('.. 
note::'):
+            md_lines.append('> **Note:**')
+            i += 1
+            # Get the note content (indented lines)
+            while i < len(lines) and (lines[i].startswith('   ') or lines[i].strip() == ''):
+                if lines[i].startswith('   '):
+                    md_lines.append(f'> {lines[i][3:]}')
+                elif lines[i].strip():
+                    md_lines.append(f'> {lines[i]}')
+                else:
+                    md_lines.append('>')
+                i += 1
+            md_lines.append('')
+            continue
+
+        # Check for code block marker (:: at end of line)
+        if line.strip().endswith('::'):
+            # Look ahead for indented block
+            j = i + 1
+            # Skip blank line after ::
+            if j < len(lines) and lines[j].strip() == '':
+                j += 1
+
+            block_lines = []
+            # Check for any indentation (tabs or spaces)
+            while j < len(lines) and (lines[j].startswith('\t') or lines[j].startswith('    ') or
+                                      lines[j].startswith('   ') or lines[j].startswith('  ') or
+                                      lines[j].startswith(' ') or lines[j].strip() == ''):
+                if lines[j].startswith('\t'):
+                    block_lines.append(lines[j][1:])
+                elif lines[j].startswith('    '):
+                    block_lines.append(lines[j][4:])
+                elif lines[j].startswith('   '):
+                    block_lines.append(lines[j][3:])
+                elif lines[j].startswith('  '):
+                    block_lines.append(lines[j][2:])
+                elif lines[j].startswith(' '):
+                    block_lines.append(lines[j][1:])
+                else:
+                    block_lines.append(lines[j])
+                j += 1
+
+            if block_lines:
+                block_content = '\n'.join(block_lines)
+
+                # Check if it has os>, PPL>, or > prompts
+                if 'os>' in block_content or 'PPL>' in block_content or block_content.strip().startswith('>'):
+                    md_block = convert_code_block_to_markdown(block_content)
+                    if md_block:
+                        # Add the description line before :: (if not "PPL query")
+                        desc_line = line.rstrip(':').strip()
+                        if desc_line and desc_line.lower() not in ['ppl query', 'query']:
+                            md_lines.append(desc_line + '\n')
+                        md_lines.append(md_block + '\n')
+                        i = j
+                        continue
+                else:
+                    # Generic code block without prompts - wrap in markdown fence
+                    desc_line = line.rstrip(':').strip()
+                    if desc_line and desc_line.lower() not in ['example', 'result', 'result set']:
+                        md_lines.append(desc_line + '\n')
+                    md_lines.append('```bash\n' + block_content + '\n```\n')
+                    i = j
+                    continue
+
+            # If no indented block found, just remove the ::
+            md_lines.append(line.rstrip(':').strip())
+            i += 1
+            continue
+
+        # Regular line
+        if line.strip():  # Skip empty lines at the start
+            md_lines.append(line)
+        i += 1
+
+    return '\n'.join(md_lines)
+
+
+def convert_file(rst_path: Path, md_path: Path) -> None:
+    """Convert a single RST file to Markdown."""
+    print(f"Converting {rst_path} -> {md_path}")
+
+    rst_content = rst_path.read_text(encoding='utf-8')
+
+    # Convert
+    md_content = convert_rst_to_markdown(rst_content)
+
+    # Write output
+    md_path.write_text(md_content, encoding='utf-8')
+
+    print(f"  ✓ Converted successfully")
+
+
+def main() -> None:
+    if len(sys.argv) < 2:
+        print("Usage: python convert_rst_to_md.py <input_rst_file> [output_md_file]")
+        print("   or: python convert_rst_to_md.py --batch <directory>")
+        sys.exit(1)
+
+    if sys.argv[1] == '--batch':
+        # Batch convert all RST files in directory
+        directory = Path(sys.argv[2]) if len(sys.argv) > 2 else Path('docs/user/ppl/cmd')
+
+        rst_files = list(directory.glob('*.rst'))
+        print(f"Found {len(rst_files)} RST files in {directory}")
+
+        for rst_file in rst_files:
+            md_file = rst_file.with_suffix('.md')
+            try:
+                convert_file(rst_file, md_file)
+            except Exception as e:
+                print(f"  ✗ Error: {e}")
+    else:
+        # Single file conversion
+        rst_file = Path(sys.argv[1])
+        md_file = Path(sys.argv[2]) if len(sys.argv) > 2 else rst_file.with_suffix('.md')
+
+        convert_file(rst_file, md_file)
+
+
+if __name__ == 
'__main__': + main() diff --git a/scripts/docs_exporter/export_to_docs_website.py b/scripts/docs_exporter/export_to_docs_website.py new file mode 100755 index 00000000000..0b34984a006 --- /dev/null +++ b/scripts/docs_exporter/export_to_docs_website.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +""" +Minimal markdown exporter for OpenSearch SQL documentation. +Exports docs/user/ppl to ../documentation-website/_search-plugins/sql/ +""" + +import re +from pathlib import Path +from typing import Optional + +def extract_title(content: str) -> Optional[str]: + """Extract title from first H1 heading or return None.""" + match = re.search(r'^#\s+(.+)$', content, re.MULTILINE) + return match.group(1).strip() if match else None + +def generate_frontmatter(title: Optional[str], parent: Optional[str] = None, nav_order: int = 1, has_children: bool = False) -> str: + """Generate Jekyll front-matter.""" + fm = ["---", "layout: default"] + if title: + fm.append(f"title: {title}") + if parent: + fm.append(f"parent: {parent}") + fm.append(f"nav_order: {nav_order}") + if has_children: + fm.append("has_children: true") + fm.append("---\n") + return "\n".join(fm) + +def process_file(source_file: Path, target_file: Path, parent: Optional[str] = None, nav_order: int = 1) -> None: + """Process a single markdown file.""" + content = source_file.read_text(encoding='utf-8') + + # Convert PPL code fences to SQL + content = re.sub(r'^```ppl\b.*$', '```sql', content, flags=re.MULTILINE) + + # Add copy buttons after code fences + content = re.sub(r'^```(bash|sh|sql)\b.*?\n(.*?)^```$', + r'```\1\n\2```\n{% include copy.html %}', + content, flags=re.MULTILINE | re.DOTALL) + + # Remove .md extension from relative links (keep http/https links unchanged) + content = re.sub(r'\]\((?!https?://)(.*?)\.md(#[^\)]*)?\)', r'](\1\2)', content) + + title = extract_title(content) or source_file.stem.replace('-', ' ').title() + + # Check if this directory has child markdown files in subdirectories + has_children = any(source_file.parent.glob('*/*.md')) + + frontmatter = generate_frontmatter(title, parent, nav_order, has_children) + + # Create target directory + target_file.parent.mkdir(parents=True, exist_ok=True) + + # Write file with front-matter + target_file.write_text(frontmatter + content, encoding='utf-8') + +def export_docs() -> None: + """Export PPL docs to documentation website.""" + source_dir = Path("../../docs/user/ppl") + target_dir = Path("../../../documentation-website/_sql-and-ppl/ppl-reference") + + if not source_dir.exists(): + print(f"Source directory {source_dir} not found") + return + + # Check if target directory exists and has files + if target_dir.exists() and any(target_dir.glob('**/*.md')): + response = input(f"Target directory {target_dir} contains files. Overwrite? 
(y/n): ") + if response.lower() != 'y': + print("Export cancelled") + return + + # Get all markdown files + md_files = list(source_dir.glob('**/*.md')) + + for i, md_file in enumerate(md_files, 1): + # Calculate relative path from source + rel_path = md_file.relative_to(source_dir) + target_file = target_dir / rel_path + + # Determine parent based on directory structure + parent = ( + "SQL and PPL" + if rel_path.parent == Path(".") + else rel_path.parent.name.replace("-", " ").title() + ) + + process_file(md_file, target_file, parent, i) + print(f"Exported: {md_file} -> {target_file}") + + # Generate index.md for each directory + dirs = set(md_file.relative_to(source_dir).parent for md_file in md_files) + for dir_path in sorted(dirs): + if dir_path == Path("."): + continue + target_index = target_dir / dir_path / "index.md" + title = dir_path.name.replace("-", " ").title() + parent = "Opensearch Ppl Reference Manual" if dir_path.parent == Path(".") else dir_path.parent.name.replace("-", " ").title() + frontmatter = generate_frontmatter(title, parent, has_children=True) + target_index.write_text(frontmatter, encoding='utf-8') + print(f"Generated: {target_index}") + +if __name__ == "__main__": + export_docs() diff --git a/scripts/docs_exporter/fix_markdown_formatting.py b/scripts/docs_exporter/fix_markdown_formatting.py new file mode 100755 index 00000000000..b8dbc3a59b3 --- /dev/null +++ b/scripts/docs_exporter/fix_markdown_formatting.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +""" +Comprehensive markdown formatting script for docs/user/ppl/**/*.md +- Adds proper spacing before code blocks and tables for Jekyll compatibility +- Adds double spaces after headers and list items for proper line breaks +- Adds blank lines after lists end +""" + +import re +from pathlib import Path +from typing import List + +def fix_markdown_formatting(content: str) -> str: + """Fix markdown formatting by adding proper spacing and line breaks.""" + lines = content.split('\n') + fixed_lines: List[str] = [] + in_code_block = False + in_table = False + + for i, line in enumerate(lines): + # Check if current line is a code block start/end + is_code_block_marker = line.startswith('```') + + # Check if current line is any table line (including separator rows) + is_table_line = line.strip().startswith('|') and line.strip().endswith('|') + + # Check if it's a data row (not a separator) + is_table_separator = re.match(r'^\s*\|[\s\-\|:]*\|\s*$', line) + is_table_row = is_table_line and not is_table_separator + + # Get previous and next lines for context + prev_line = lines[i-1] if i > 0 else '' + next_line = lines[i+1] if i+1 < len(lines) else '' + + # Convert blank lines after code blocks/tables to double-space lines + if not line.strip() and prev_line.strip(): + prev_is_code_end = prev_line.startswith('```') + prev_is_table_end = (prev_line.strip().startswith('|') and prev_line.strip().endswith('|') and + not re.match(r'^\s*\|[\s\-\|:]*\|\s*$', prev_line)) + if prev_is_code_end or (prev_is_table_end and not in_table): + line = ' ' + + # Add spacing BEFORE code blocks and tables (check BEFORE updating in_table) + if not in_code_block: + # Opening code block + if is_code_block_marker: + if fixed_lines and fixed_lines[-1].strip(): + fixed_lines.append(' ') + elif fixed_lines and not fixed_lines[-1].strip(): + fixed_lines[-1] = ' ' + + # Starting table (first table row) + elif is_table_row and not in_table: + if fixed_lines and fixed_lines[-1].strip(): + fixed_lines.append(' ') + elif fixed_lines and not 
fixed_lines[-1].strip(): + fixed_lines[-1] = ' ' + + # Detect table start/end (AFTER spacing logic) - use is_table_line to include separators + if not in_code_block: + if is_table_line and not in_table: + in_table = True + elif not is_table_line and in_table: + in_table = False + + # Process line for double spaces (only outside code blocks) + if not in_code_block and not is_code_block_marker and line.strip(): + # Add double spaces to headers and list items if not already present + is_header = line.startswith('#') + is_list = (line.strip().startswith('* ') or + line.strip().startswith('- ') or + line.strip().startswith('+ ') or + re.match(r'^\s*\d+\.\s', line)) + + if (is_header or is_list) and not line.endswith(' '): + line = line + ' ' + + fixed_lines.append(line) + + # Update code block state AFTER processing the line + if is_code_block_marker: + in_code_block = not in_code_block + + # Add spacing AFTER code blocks and tables + if not in_code_block: + # Closing code block + if is_code_block_marker and next_line.strip(): + if i+1 < len(lines) and not lines[i+1].strip(): + pass # Will convert blank line when we reach it + else: + fixed_lines.append(' ') + + # Ending table (last table row before non-table content) + elif is_table_row and in_table and next_line.strip() and not (next_line.strip().startswith('|') and next_line.strip().endswith('|')): + if i+1 < len(lines) and not lines[i+1].strip(): + pass # Will convert blank line when we reach it + else: + fixed_lines.append(' ') + + # Add blank line after list ends (only outside code blocks and tables) + if not in_code_block and not is_code_block_marker and not in_table: + current_is_list = (line.strip().startswith('* ') or + line.strip().startswith('- ') or + line.strip().startswith('+ ') or + re.match(r'^\s*\d+\.\s', line)) + next_is_not_list = (next_line.strip() and + not next_line.strip().startswith('* ') and + not next_line.strip().startswith('- ') and + not next_line.strip().startswith('+') and + not re.match(r'^\s*\d+\.\s', next_line) and + not next_line.strip().startswith(' ')) # Not indented continuation + + # Add blank line after list ends (with double spaces) + if current_is_list and next_is_not_list: + fixed_lines.append(' ') + + return '\n'.join(fixed_lines) + +def process_file(file_path: Path) -> bool: + """Process a single markdown file.""" + content = file_path.read_text(encoding='utf-8') + + fixed_content = fix_markdown_formatting(content) + + # Only write if content changed + if fixed_content != content: + file_path.write_text(fixed_content, encoding='utf-8') + print(f"Fixed: {file_path}") + return True + return False + +def main() -> None: + """Fix all markdown files in docs/user/ppl/""" + # Get the directory where this script is located + script_dir = Path(__file__).parent + + # Define path relative to the script location + source_dir = script_dir / "../../docs/user/ppl" + + if not source_dir.exists(): + print(f"Source directory {source_dir} not found") + return + + print("Fixing markdown formatting:") + print("- Adding double-space lines above and below code blocks") + print("- Adding double-space lines above and below tables") + print("- Adding double spaces after headers and list items") + print("- Adding blank lines after lists") + print() + + md_files = list(source_dir.glob('**/*.md')) + fixed_count = 0 + + for md_file in md_files: + if process_file(md_file): + fixed_count += 1 + + print(f"\nProcessed {len(md_files)} files, fixed {fixed_count} files") + +if __name__ == "__main__": + main() From 
4246f39a2bba5786308d222e6050e0ceed0b4e3f Mon Sep 17 00:00:00 2001 From: Kyle Hounslow Date: Tue, 9 Dec 2025 12:37:09 -0800 Subject: [PATCH 2/3] fix: link-checker errors Signed-off-by: Kyle Hounslow --- README.md | 2 +- docs/dev/intro-v3-engine.md | 6 +++--- docs/user/ppl/admin/connectors/s3glue_connector.md | 4 ++-- docs/user/ppl/admin/connectors/security_lake_connector.md | 4 ++-- docs/user/ppl/functions/relevance.md | 2 +- docs/user/ppl/general/datatypes.md | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index d139da3430e..4a7e1e5ec9e 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ Recently we have been actively improving our query engine primarily for better c ## Documentation -Please refer to the [SQL Language Reference Manual](./docs/user/index.rst), [Piped Processing Language (PPL) Reference Manual](./docs/user/ppl/index.rst), [OpenSearch SQL/PPL Engine Development Manual](./docs/dev/index.md) and [Technical Documentation](https://opensearch.org/docs/latest/search-plugins/sql/index/) for detailed information on installing and configuring plugin. +Please refer to the [SQL Language Reference Manual](./docs/user/index.rst), [Piped Processing Language (PPL) Reference Manual](./docs/user/ppl/index.md), [OpenSearch SQL/PPL Engine Development Manual](./docs/dev/index.md) and [Technical Documentation](https://opensearch.org/docs/latest/search-plugins/sql/index/) for detailed information on installing and configuring plugin. ## Forum diff --git a/docs/dev/intro-v3-engine.md b/docs/dev/intro-v3-engine.md index 43b90bda204..fd73cd5c8e1 100644 --- a/docs/dev/intro-v3-engine.md +++ b/docs/dev/intro-v3-engine.md @@ -26,9 +26,9 @@ Find more details in [V3 Architecture](./intro-v3-architecture.md). In the initial release of the V3 engine (3.0.0), the main new features focus on enhancing the PPL language while maintaining maximum compatibility with V2 behavior. -* **[Join](../user/ppl/cmd/join.rst) Command** -* **[Lookup](../user/ppl/cmd/lookup.rst) Command** -* **[Subquery](../user/ppl/cmd/subquery.rst) Command** +* **[Join](../user/ppl/cmd/join.md) Command** +* **[Lookup](../user/ppl/cmd/lookup.md) Command** +* **[Subquery](../user/ppl/cmd/subquery.md) Command** V3 (Calcite integration) engine is enabled by default in 3.3.0. diff --git a/docs/user/ppl/admin/connectors/s3glue_connector.md b/docs/user/ppl/admin/connectors/s3glue_connector.md index e05edbaa308..4b2d75f1042 100644 --- a/docs/user/ppl/admin/connectors/s3glue_connector.md +++ b/docs/user/ppl/admin/connectors/s3glue_connector.md @@ -8,7 +8,7 @@ This page covers s3Glue datasource configuration and also how to query and s3Glu * `EMRServerless Spark Execution Engine Config Setting`: Since we execute s3Glue queries on top of spark execution engine, we require this configuration. - More details: [ExecutionEngine Config](../../../interfaces/asyncqueryinterface.md#id2) + More details: [ExecutionEngine Config](../../../interfaces/asyncqueryinterface.rst#id2) * `S3`: This is where the data lies. * `Glue` Metadata store: Glue takes care of table metadata. * `Opensearch IndexStore`: Index for s3 data lies in opensearch and also acts as temporary buffer for query results. @@ -73,5 +73,5 @@ Sample Queries * Create Covering Index Query: `create index clientip_year on my_glue.default.http_logs (clientip, year) WITH (auto_refresh=true)` * Create Skipping Index: `create skipping index on mys3.default.http_logs (status VALUE_SET)` -These queries would work only top of async queries. 
Documentation: [Async Query APIs](../../../interfaces/asyncqueryinterface.md) +These queries would work only top of async queries. Documentation: [Async Query APIs](../../../interfaces/asyncqueryinterface.rst) Documentation for Index Queries: https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md \ No newline at end of file diff --git a/docs/user/ppl/admin/connectors/security_lake_connector.md b/docs/user/ppl/admin/connectors/security_lake_connector.md index a9b27cf7e21..d33f1f49702 100644 --- a/docs/user/ppl/admin/connectors/security_lake_connector.md +++ b/docs/user/ppl/admin/connectors/security_lake_connector.md @@ -7,7 +7,7 @@ Security Lake connector provides a way to query Security Lake tables. * `EMRServerless Spark Execution Engine Config Setting`: Since we execute s3Glue queries on top of spark execution engine, we require this configuration. - More details: [ExecutionEngine Config](../../../interfaces/asyncqueryinterface.md#id2) + More details: [ExecutionEngine Config](../../../interfaces/asyncqueryinterface.rst#id2) * `S3`: This is where the data lies. * `Glue`: Metadata store: Glue takes care of table metadata. * `Lake Formation`: AWS service that performs authorization on Security Lake tables @@ -59,5 +59,5 @@ Sample Queries * Select Query : `select * from mysl.amazon_security_lake_glue_db_eu_west_1.amazon_security_lake_table_eu_west_1_vpc_flow_2_0 limit 1` * Create Covering Index Query: `create index srcip_time on mysl.amazon_security_lake_glue_db_eu_west_1.amazon_security_lake_table_eu_west_1_vpc_flow_2_0 (src_endpoint.ip, time) WITH (auto_refresh=true)` -These queries would work only top of async queries. Documentation: [Async Query APIs](../../../interfaces/asyncqueryinterface.md) +These queries would work only top of async queries. Documentation: [Async Query APIs](../../../interfaces/asyncqueryinterface.rst) Documentation for Index Queries: https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md \ No newline at end of file diff --git a/docs/user/ppl/functions/relevance.md b/docs/user/ppl/functions/relevance.md index 2db7caf9a2c..a40a3cd7644 100644 --- a/docs/user/ppl/functions/relevance.md +++ b/docs/user/ppl/functions/relevance.md @@ -502,4 +502,4 @@ fetched rows / total rows = 2/2 ### Limitations -The relevance functions are available to execute only in OpenSearch DSL but not in memory as of now, so the relevance search might fail for queries that are too complex to translate into DSL if the relevance function is following after a complex PPL query. To make your queries always work-able, it is recommended to place the relevance commands as close to the search command as possible, to ensure the relevance functions are eligible to push down. For example, a complex query like `search source = people | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | where match(employer, 'Open Search') | stats count() by city` could fail because it is difficult to translate to DSL, but it would be better if we rewrite it to an equivalent query as `search source = people | where match(employer, 'Open Search') | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | stats count() by city` by moving the where command with relevance function to the second command right after the search command, and the relevance would be optimized and executed smoothly in OpenSearch DSL. 
See [Optimization](../../optimization/optimization.md) to get more details about the query engine optimization. \ No newline at end of file +The relevance functions are available to execute only in OpenSearch DSL but not in memory as of now, so the relevance search might fail for queries that are too complex to translate into DSL if the relevance function is following after a complex PPL query. To make your queries always work-able, it is recommended to place the relevance commands as close to the search command as possible, to ensure the relevance functions are eligible to push down. For example, a complex query like `search source = people | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | where match(employer, 'Open Search') | stats count() by city` could fail because it is difficult to translate to DSL, but it would be better if we rewrite it to an equivalent query as `search source = people | where match(employer, 'Open Search') | rename firstname as name | dedup account_number | fields name, account_number, balance, employer | stats count() by city` by moving the where command with relevance function to the second command right after the search command, and the relevance would be optimized and executed smoothly in OpenSearch DSL. See [Optimization](../../optimization/optimization.rst) to get more details about the query engine optimization. \ No newline at end of file diff --git a/docs/user/ppl/general/datatypes.md b/docs/user/ppl/general/datatypes.md index 27fc71155ee..b9e1cac1249 100644 --- a/docs/user/ppl/general/datatypes.md +++ b/docs/user/ppl/general/datatypes.md @@ -58,7 +58,7 @@ Notes: Not all the PPL Type has correspond OpenSearch Type. e.g. data and time. Numeric values ranged from -2147483648 to +2147483647 are recognized as integer with type name `int`. For others outside the range, `bigint` integer will be the data type after parsed. ## Date and Time Data Types -The date and time data types are the types that represent temporal values and PPL plugin supports types including DATE, TIME, TIMESTAMP and INTERVAL. By default, the OpenSearch DSL uses date type as the only date and time related type, which has contained all information about an absolute time point. To integrate with PPL language, each of the types other than timestamp is holding part of temporal or timezone information, and the usage to explicitly clarify the date and time types is reflected in the datetime functions (see [Functions](functions.md) for details), where some functions might have restrictions in the input argument type. +The date and time data types are the types that represent temporal values and PPL plugin supports types including DATE, TIME, TIMESTAMP and INTERVAL. By default, the OpenSearch DSL uses date type as the only date and time related type, which has contained all information about an absolute time point. To integrate with PPL language, each of the types other than timestamp is holding part of temporal or timezone information, and the usage to explicitly clarify the date and time types is reflected in the datetime functions (see [Functions](../functions/datetime.md) for details), where some functions might have restrictions in the input argument type. ### Date Date represents the calendar date regardless of the time zone. A given date value represents a 24-hour period, or say a day, but this period varies in different timezones and might have flexible hours during Daylight Savings Time programs. 
Besides, the date type does not contain time information as well. The supported range is '1000-01-01' to '9999-12-31'. @@ -91,7 +91,7 @@ Interval data type represents a temporal duration or a period. The syntax is as | --- | --- | | Interval | INTERVAL expr unit | -The expr is any expression that can be iterated to a quantity value eventually, see [Expressions](expressions.md) for details. The unit represents the unit for interpreting the quantity, including MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER and YEAR.The INTERVAL keyword and the unit specifier are not case sensitive. Note that there are two classes of intervals. Year-week intervals can store years, quarters, months and weeks. Day-time intervals can store days, hours, minutes, seconds and microseconds. Year-week intervals are comparable only with another year-week intervals. These two types of intervals can only comparable with the same type of themselves. +The expr is any expression that can be iterated to a quantity value eventually, see [Expressions](../functions/expressions.md) for details. The unit represents the unit for interpreting the quantity, including MICROSECOND, SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER and YEAR.The INTERVAL keyword and the unit specifier are not case sensitive. Note that there are two classes of intervals. Year-week intervals can store years, quarters, months and weeks. Day-time intervals can store days, hours, minutes, seconds and microseconds. Year-week intervals are comparable only with another year-week intervals. These two types of intervals can only comparable with the same type of themselves. ### Conversion between date and time types Basically the date and time types except interval can be converted to each other, but might suffer some alteration of the value or some information loss, for example extracting the time value from a timestamp value, or convert a date value to a timestamp value and so forth. 
Here lists the summary of the conversion rules that PPL plugin supports for each of the types:

From 90a31d9e8b381a1c61d3bc0bc2882b5e364a485c Mon Sep 17 00:00:00 2001
From: Kyle Hounslow 
Date: Tue, 9 Dec 2025 12:49:36 -0800
Subject: [PATCH 3/3] fix: replace test.com link in UT

Signed-off-by: Kyle Hounslow 
---
 .../sql/prometheus/storage/PrometheusStorageFactoryTest.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prometheus/src/test/java/org/opensearch/sql/prometheus/storage/PrometheusStorageFactoryTest.java b/prometheus/src/test/java/org/opensearch/sql/prometheus/storage/PrometheusStorageFactoryTest.java
index 7b1e2dec0f7..3f465b96d2b 100644
--- a/prometheus/src/test/java/org/opensearch/sql/prometheus/storage/PrometheusStorageFactoryTest.java
+++ b/prometheus/src/test/java/org/opensearch/sql/prometheus/storage/PrometheusStorageFactoryTest.java
@@ -130,7 +130,7 @@ void testGetStorageEngineWithWrongAuthType() {
         .thenReturn(Collections.emptyList());
     PrometheusStorageFactory prometheusStorageFactory = new PrometheusStorageFactory(settings);
     HashMap<String, String> properties = new HashMap<>();
-    properties.put("prometheus.uri", "https://test.com");
+    properties.put("prometheus.uri", "https://opensearch.org");
     properties.put("prometheus.auth.type", "random");
     properties.put("prometheus.auth.region", "us-east-1");
     properties.put("prometheus.auth.secret_key", "accessKey");
@@ -150,7 +150,7 @@ void testGetStorageEngineWithNONEAuthType() {
         .thenReturn(Collections.emptyList());
     PrometheusStorageFactory prometheusStorageFactory = new PrometheusStorageFactory(settings);
     HashMap<String, String> properties = new HashMap<>();
-    properties.put("prometheus.uri", "https://test.com");
+    properties.put("prometheus.uri", "https://opensearch.org");
     StorageEngine storageEngine = prometheusStorageFactory.getStorageEngine(properties);
     Assertions.assertTrue(storageEngine instanceof PrometheusStorageEngine);
 }